[apple/xnu.git] / osfmk / kern / waitq.c (xnu-3248.20.55)
1 /*
2 * Copyright (c) 2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 #include <kern/ast.h>
57 #include <kern/kern_types.h>
58 #include <kern/mach_param.h>
59 #include <kern/queue.h>
60 #include <kern/sched_prim.h>
61 #include <kern/simple_lock.h>
62 #include <kern/spl.h>
63 #include <kern/waitq.h>
64 #include <kern/zalloc.h>
65 #include <libkern/OSAtomic.h>
66 #include <mach/sync_policy.h>
67 #include <vm/vm_kern.h>
68
69 #include <sys/kdebug.h>
70
71 #if CONFIG_WAITQ_DEBUG
72 #define wqdbg(fmt,...) \
73 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
74 #else
75 #define wqdbg(fmt,...) do { } while (0)
76 #endif
77
78 #ifdef WAITQ_VERBOSE_DEBUG
79 #define wqdbg_v(fmt,...) \
80 printf("WQ[v:%s]: " fmt "\n", __func__, ## __VA_ARGS__)
81 #else
82 #define wqdbg_v(fmt,...) do { } while (0)
83 #endif
84
85 #define wqinfo(fmt,...) \
86 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
87
88 #define wqerr(fmt,...) \
89 printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
90
91
92 /*
93 * un-comment the following lines to debug the link/prepost tables
94 * NOTE: this expands each element by ~40 bytes
95 */
96 //#define CONFIG_WAITQ_LINK_STATS
97 //#define CONFIG_WAITQ_PREPOST_STATS
98
99 /*
100 * file-static functions / data
101 */
102 static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
103 uint64_t *reserved_preposts,
104 int priority, spl_t *spl);
105
106 static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
107 event64_t event,
108 thread_t thread, spl_t *spl);
109
110 #define WAITQ_SET_MAX (task_max * 3)
111 static zone_t waitq_set_zone;
112
113
114 #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
115 #define ROUNDDOWN(x,y) (((x)/(y))*(y))
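/*
 * Worked example (illustrative note, not from the original source):
 * P2ROUNDUP relies on two's-complement arithmetic and assumes 'align'
 * is a power of two. With a 4096-byte page:
 *
 *   P2ROUNDUP(5000, 4096) == 8192   (-(-5000 & -4096) == 8192)
 *   P2ROUNDUP(8192, 4096) == 8192   (already-aligned values are unchanged)
 *   ROUNDDOWN(5000, 4096) == 4096   (truncating divide, then multiply)
 */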
116
117
118 #ifdef CONFIG_WAITQ_STATS
119 static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
120 #endif
121
122
123 /* ----------------------------------------------------------------------
124 *
125 * Wait Queue Link/Prepost Table Implementation
126 *
127 * ---------------------------------------------------------------------- */
128 #define DEFAULT_MIN_FREE_TABLE_ELEM 100
129 static uint32_t g_min_free_table_elem;
130 static uint32_t g_min_free_cache;
131
132 static vm_size_t g_wqt_max_tbl_size;
133 static lck_grp_t g_wqt_lck_grp;
134
135 /* 1 prepost table, 1 setid link table */
136 #define NUM_WQ_TABLES 2
137
138 /* default VA space for waitq tables (zone allocated) */
139 #define DEFAULT_MAX_TABLE_SIZE P2ROUNDUP(8 * 1024 * 1024, PAGE_SIZE)
140
141 struct wq_id {
142 union {
143 uint64_t id;
144 struct {
145 /*
146 * this bitfield is OK because we don't need to
147 * enforce a particular memory layout
148 */
149 uint64_t idx:18, /* allows indexing up to 8MB of 32byte link objects */
150 generation:46;
151 };
152 };
153 };
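/*
 * Illustrative sketch (not part of the original source): how the 64-bit
 * id packs the slab index and generation count. The exact bit placement
 * is implementation-defined (see the comment above), but on the
 * little-endian LP64 ABIs this kernel targets, 'idx' lands in the low
 * 18 bits of the id.
 */
#if 0 /* example only */
static void wq_id_packing_example(void)
{
	struct wq_id id;

	id.id = 0;
	id.idx = 5;		/* element 5 of the table */
	id.generation = 3;	/* this slot's third allocation */

	/* with idx in the low bits, id.id == (3ULL << 18) | 5 */
	assert(id.idx == 5 && id.generation == 3);
}
#endif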
154
155 enum wqt_elem_type {
156 WQT_FREE = 0,
157 WQT_ELEM = 1,
158 WQT_LINK = 2,
159 WQT_RESERVED = 3,
160 };
161
162 struct wqt_elem {
163 uint32_t wqt_bits;
164
165 uint32_t wqt_next_idx;
166
167 struct wq_id wqt_id;
168 };
169
170 /* this _must_ match the idx bitfield definition in struct wq_id */
171 #define WQT_IDX_MAX (0x3ffff)
172 #if defined(DEVELOPMENT) || defined(DEBUG)
173 /* global for lldb macros */
174 uint64_t g_wqt_idx_max = WQT_IDX_MAX;
175 #endif
176
177 /* reference count bits should _always_ be the low-order bits */
178 #define WQT_BITS_REFCNT_MASK (0x1FFFFFFF)
179 #define WQT_BITS_REFCNT_SHIFT (0)
180 #define WQT_BITS_REFCNT (WQT_BITS_REFCNT_MASK << WQT_BITS_REFCNT_SHIFT)
181
182 #define WQT_BITS_TYPE_MASK (0x3)
183 #define WQT_BITS_TYPE_SHIFT (29)
184 #define WQT_BITS_TYPE (WQT_BITS_TYPE_MASK << WQT_BITS_TYPE_SHIFT)
185
186 #define WQT_BITS_VALID_MASK (0x1)
187 #define WQT_BITS_VALID_SHIFT (31)
188 #define WQT_BITS_VALID (WQT_BITS_VALID_MASK << WQT_BITS_VALID_SHIFT)
189
190 #define wqt_bits_refcnt(bits) \
191 (((bits) >> WQT_BITS_REFCNT_SHIFT) & WQT_BITS_REFCNT_MASK)
192
193 #define wqt_bits_type(bits) \
194 (((bits) >> WQT_BITS_TYPE_SHIFT) & WQT_BITS_TYPE_MASK)
195
196 #define wqt_bits_valid(bits) \
197 ((bits) & WQT_BITS_VALID)
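/*
 * Illustrative sketch (not part of the original source): the 32-bit
 * wqt_bits word packs a 29-bit reference count (bits 0-28), a 2-bit
 * type (bits 29-30), and a valid flag (bit 31).
 */
#if 0 /* example only */
static void wqt_bits_layout_example(void)
{
	uint32_t bits = WQT_BITS_VALID |
			(WQT_LINK << WQT_BITS_TYPE_SHIFT) |
			(2 << WQT_BITS_REFCNT_SHIFT);

	assert(wqt_bits_valid(bits));			/* bit 31 is set */
	assert(wqt_bits_type(bits) == WQT_LINK);	/* type field == 2 */
	assert(wqt_bits_refcnt(bits) == 2);		/* two outstanding refs */
}
#endif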
198
199 struct wq_table;
200 typedef void (*wq_table_poison_func)(struct wq_table *, struct wqt_elem *);
201
202 /*
203 * A table is a container for slabs of elements. Each slab is 'slab_sz' bytes
204 * and contains 'slab_sz/elem_sz' elements (of 'elem_sz' bytes each). These
205 * slabs allow the table to be broken up into potentially dis-contiguous VA
206 * space. On 32-bit platforms with large amounts of physical RAM, this is
207 * quite important. Keeping slabs like this slightly complicates retrieval of
208 * table elements, but not by much.
209 */
210 struct wq_table {
211 struct wqt_elem **table; /* an array of 'slabs' of elements */
212 struct wqt_elem **next_free_slab;
213 struct wq_id free_list __attribute__((aligned(8)));
214
215 uint32_t nelem;
216 uint32_t used_elem;
217 uint32_t elem_sz; /* size of a table element (bytes) */
218
219 uint32_t slab_sz; /* size of a table 'slab' object (bytes) */
220 uint32_t slab_shift;
221 uint32_t slab_msk;
222 uint32_t slab_elem;
223 zone_t slab_zone;
224
225 wq_table_poison_func poison;
226
227 lck_mtx_t lock;
228 uint32_t state;
229
230 #if CONFIG_WAITQ_STATS
231 uint32_t nslabs;
232
233 uint64_t nallocs;
234 uint64_t nreallocs;
235 uint64_t npreposts;
236 int64_t nreservations;
237 uint64_t nreserved_releases;
238 uint64_t nspins;
239
240 uint64_t max_used;
241 uint64_t avg_used;
242 uint64_t max_reservations;
243 uint64_t avg_reservations;
244 #endif
245 } __attribute__((aligned(8)));
246
247 #define wqt_elem_ofst_slab(slab, slab_msk, ofst) \
248 /* cast through 'void *' to avoid compiler alignment warning messages */ \
249 ((struct wqt_elem *)((void *)((uintptr_t)(slab) + ((ofst) & (slab_msk)))))
250
251 #if defined(CONFIG_WAITQ_LINK_STATS) || defined(CONFIG_WAITQ_PREPOST_STATS)
252 /* version that makes no assumption on waste within a slab */
253 static inline struct wqt_elem *
254 wqt_elem_idx(struct wq_table *table, uint32_t idx)
255 {
256 int slab_idx = idx / table->slab_elem;
257 struct wqt_elem *slab = table->table[slab_idx];
258 if (!slab)
259 panic("Invalid index:%d slab:%d (NULL) for table:%p\n",
260 idx, slab_idx, table);
261 assert(slab->wqt_id.idx <= idx && (slab->wqt_id.idx + table->slab_elem) > idx);
262 return wqt_elem_ofst_slab(slab, table->slab_msk, (idx - slab->wqt_id.idx) * table->elem_sz);
263 }
264 #else /* !CONFIG_WAITQ_[LINK|PREPOST]_STATS */
265 /* version that assumes 100% utilization of slabs (no waste) */
266 static inline struct wqt_elem *
267 wqt_elem_idx(struct wq_table *table, uint32_t idx)
268 {
269 uint32_t ofst = idx * table->elem_sz;
270 struct wqt_elem *slab = table->table[ofst >> table->slab_shift];
271 if (!slab)
272 panic("Invalid index:%d slab:%d (NULL) for table:%p\n",
273 idx, (ofst >> table->slab_shift), table);
274 assert(slab->wqt_id.idx <= idx && (slab->wqt_id.idx + table->slab_elem) > idx);
275 return wqt_elem_ofst_slab(slab, table->slab_msk, ofst);
276 }
277 #endif /* !CONFIG_WAITQ_[LINK|PREPOST]_STATS */
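/*
 * Worked example (illustrative note, not from the original source) for
 * the no-waste variant above, using hypothetical sizes: with elem_sz == 32
 * and slab_sz == 4096, slab_shift == 12, slab_msk == 0xfff and
 * slab_elem == 128. Looking up idx == 130 gives ofst == 4160, so the
 * element lives in table->table[4160 >> 12] == table->table[1], at byte
 * offset (4160 & 0xfff) == 64 within that slab.
 */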
278
279 static int __assert_only wqt_elem_in_range(struct wqt_elem *elem,
280 struct wq_table *table)
281 {
282 struct wqt_elem **base = table->table;
283 uintptr_t e = (uintptr_t)elem;
284 assert(base != NULL);
285 while (*base != NULL) {
286 uintptr_t b = (uintptr_t)(*base);
287 if (e >= b && e < b + table->slab_sz)
288 return 1;
289 base++;
290 if ((uintptr_t)base >= (uintptr_t)table->table + PAGE_SIZE)
291 return 0;
292 }
293 return 0;
294 }
295
296 static struct wqt_elem *wq_table_get_elem(struct wq_table *table, uint64_t id);
297 static void wq_table_put_elem(struct wq_table *table, struct wqt_elem *elem);
298 static int wqt_elem_list_link(struct wq_table *table, struct wqt_elem *parent,
299 struct wqt_elem *child);
300
301 static void wqt_elem_invalidate(struct wqt_elem *elem)
302 {
303 uint32_t __assert_only old = OSBitAndAtomic(~WQT_BITS_VALID, &elem->wqt_bits);
304 OSMemoryBarrier();
305 assert(((wqt_bits_type(old) != WQT_RESERVED) && (old & WQT_BITS_VALID)) ||
306 ((wqt_bits_type(old) == WQT_RESERVED) && !(old & WQT_BITS_VALID)));
307 }
308
309 static void wqt_elem_mkvalid(struct wqt_elem *elem)
310 {
311 uint32_t __assert_only old = OSBitOrAtomic(WQT_BITS_VALID, &elem->wqt_bits);
312 OSMemoryBarrier();
313 assert(!(old & WQT_BITS_VALID));
314 }
315
316 static void wqt_elem_set_type(struct wqt_elem *elem, int type)
317 {
318 uint32_t old_bits, new_bits;
319 do {
320 old_bits = elem->wqt_bits;
321 new_bits = (old_bits & ~WQT_BITS_TYPE) |
322 ((type & WQT_BITS_TYPE_MASK) << WQT_BITS_TYPE_SHIFT);
323 } while (OSCompareAndSwap(old_bits, new_bits, &elem->wqt_bits) == FALSE);
324 OSMemoryBarrier();
325 }
326
327
328 static void wq_table_bootstrap(void)
329 {
330 uint32_t tmp32 = 0;
331
332 g_min_free_cache = 0;
333 g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;
334 if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE)
335 g_min_free_table_elem = tmp32;
336 wqdbg("Minimum free table elements: %d", tmp32);
337
338 g_wqt_max_tbl_size = DEFAULT_MAX_TABLE_SIZE;
339 if (PE_parse_boot_argn("wqt_tbl_size", &tmp32, sizeof(tmp32)) == TRUE)
340 g_wqt_max_tbl_size = (vm_size_t)P2ROUNDUP(tmp32, PAGE_SIZE);
341
342 lck_grp_init(&g_wqt_lck_grp, "waitq_table_locks", LCK_GRP_ATTR_NULL);
343 }
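/*
 * Usage note (illustrative, not from the original source): both tunables
 * read above are ordinary boot-args. For example, booting with
 *
 *   wqt_min_free=200 wqt_tbl_size=16777216
 *
 * keeps at least 200 table elements free and sets the maximum table size
 * to 16MB of VA (the value is rounded up to PAGE_SIZE).
 */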
344
345 static void wq_table_init(struct wq_table *table, const char *name,
346 uint32_t max_tbl_elem, uint32_t elem_sz,
347 wq_table_poison_func poison)
348 {
349 kern_return_t kr;
350 uint32_t slab_sz, slab_shift, slab_msk, slab_elem;
351 zone_t slab_zone;
352 size_t max_tbl_sz;
353 struct wqt_elem *e, **base;
354
355 /*
356 * First, allocate a single page of memory to act as the base
357 * for the table's element slabs
358 */
359 kr = kernel_memory_allocate(kernel_map, (vm_offset_t *)&base,
360 PAGE_SIZE, 0, KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
361 if (kr != KERN_SUCCESS)
362 panic("Cannot initialize %s table: "
363 "kernel_memory_allocate failed:%d\n", name, kr);
364 memset(base, 0, PAGE_SIZE);
365
366 /*
367 * Based on the maximum table size, calculate the slab size:
368 * we allocate 1 page of slab pointers for the table, and we need to
369 * index elements of 'elem_sz', this gives us the slab size based on
370 * the maximum size the table should grow.
371 */
372 max_tbl_sz = (max_tbl_elem * elem_sz);
373 max_tbl_sz = P2ROUNDUP(max_tbl_sz, PAGE_SIZE);
374
375 /* system maximum table size divided by number of slots in a page */
376 slab_sz = (uint32_t)(max_tbl_sz / (PAGE_SIZE / (sizeof(void *))));
377 if (slab_sz < PAGE_SIZE)
378 slab_sz = PAGE_SIZE;
379
380 /* make sure the slab size is a power of two */
381 slab_shift = 0;
382 slab_msk = ~0;
383 for (uint32_t i = 0; i < 31; i++) {
384 uint32_t bit = (1 << i);
385 if ((slab_sz & bit) == slab_sz) {
386 slab_shift = i;
387 slab_msk = 0;
388 for (uint32_t j = 0; j < i; j++)
389 slab_msk |= (1 << j);
390 break;
391 }
392 slab_sz &= ~bit;
393 }
394 slab_elem = slab_sz / elem_sz;
395
396 /* initialize the table's slab zone (for table growth) */
397 wqdbg("Initializing %s zone: slab:%d (%d,0x%x) max:%ld",
398 name, slab_sz, slab_shift, slab_msk, max_tbl_sz);
399 slab_zone = zinit(slab_sz, max_tbl_sz, slab_sz, name);
400 assert(slab_zone != ZONE_NULL);
401
402 /* allocate the first slab and populate it */
403 base[0] = (struct wqt_elem *)zalloc(slab_zone);
404 if (base[0] == NULL)
405 panic("Can't allocate a %s table slab from zone:%p",
406 name, slab_zone);
407
408 memset(base[0], 0, slab_sz);
409
410 /* setup the initial freelist */
411 wqdbg("initializing %d links (%d bytes each)...", slab_elem, elem_sz);
412 for (unsigned l = 0; l < slab_elem; l++) {
413 e = wqt_elem_ofst_slab(base[0], slab_msk, l * elem_sz);
414 e->wqt_id.idx = l;
415 /*
416 * setting generation to 0 ensures that a setid of 0 is
417 * invalid because the generation will be incremented before
418 * each element's allocation.
419 */
420 e->wqt_id.generation = 0;
421 e->wqt_next_idx = l + 1;
422 }
423
424 /* make sure the last free element points to a never-valid idx */
425 e = wqt_elem_ofst_slab(base[0], slab_msk, (slab_elem - 1) * elem_sz);
426 e->wqt_next_idx = WQT_IDX_MAX;
427
428 lck_mtx_init(&table->lock, &g_wqt_lck_grp, LCK_ATTR_NULL);
429
430 table->slab_sz = slab_sz;
431 table->slab_shift = slab_shift;
432 table->slab_msk = slab_msk;
433 table->slab_elem = slab_elem;
434 table->slab_zone = slab_zone;
435
436 table->elem_sz = elem_sz;
437 table->nelem = slab_elem;
438 table->used_elem = 0;
440 table->poison = poison;
441
442 table->table = base;
443 table->next_free_slab = &base[1];
444 table->free_list.id = base[0]->wqt_id.id;
445
446 #if CONFIG_WAITQ_STATS
447 table->nslabs = 1;
448 table->nallocs = 0;
449 table->nreallocs = 0;
450 table->npreposts = 0;
451 table->nreservations = 0;
452 table->nreserved_releases = 0;
453
454 table->max_used = 0;
455 table->avg_used = 0;
456 table->max_reservations = 0;
457 table->avg_reservations = 0;
458 #endif
459 }
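/*
 * Worked example (illustrative note, not from the original source) of the
 * slab sizing above: with the default 8MB maximum table size and 4KB
 * pages, the single page of slab pointers holds PAGE_SIZE/sizeof(void *)
 * == 512 entries (LP64), so slab_sz == 8MB/512 == 16KB. The power-of-two
 * pass then yields slab_shift == 14 and slab_msk == 0x3fff, and a 32-byte
 * element (the link-table case) gives slab_elem == 512 elements per slab.
 */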
460
461 /**
462 * grow a waitq table by adding another 'slab' of table elements
463 *
464 * Conditions:
465 * table mutex is unlocked
466 * calling thread can block
467 */
468 static void wq_table_grow(struct wq_table *table, uint32_t min_free)
469 {
470 struct wqt_elem *slab, **slot;
471 struct wqt_elem *e = NULL, *first_new_elem, *last_new_elem;
472 struct wq_id free_id;
473 uint32_t free_elem;
474
475 assert(get_preemption_level() == 0);
476 assert(table && table->slab_zone);
477
478 lck_mtx_lock(&table->lock);
479
480 free_elem = table->nelem - table->used_elem;
481
482 /*
483 * If the caller just wanted to ensure a minimum number of elements,
484 * do that (and don't just blindly grow the table). Also, don't grow
485 * the table unnecessarily - we could have been beaten by a higher
486 * priority thread who acquired the lock and grew the table before we
487 * got here.
488 */
489 if (free_elem > min_free) {
490 lck_mtx_unlock(&table->lock);
491 return;
492 }
493
494 /* we are now committed to table growth */
495 wqdbg_v("BEGIN");
496
497 if (table->next_free_slab == NULL) {
498 /*
499 * before we panic, check one more time to see if any other
500 * threads have free'd from space in the table.
501 */
502 if ((table->nelem - table->used_elem) > 0) {
503 /* there's at least 1 free element: don't panic yet */
504 lck_mtx_unlock(&table->lock);
505 return;
506 }
507 panic("No more room to grow table: %p (nelem: %d, used: %d)",
508 table, table->nelem, table->used_elem);
509 }
510 slot = table->next_free_slab;
511 table->next_free_slab++;
512 if ((uintptr_t)table->next_free_slab >= (uintptr_t)table->table + PAGE_SIZE)
513 table->next_free_slab = NULL;
514
515 assert(*slot == NULL);
516
517 /* allocate another slab */
518 slab = (struct wqt_elem *)zalloc(table->slab_zone);
519 if (slab == NULL)
520 panic("Can't allocate a %s table (%p) slab from zone:%p",
521 table->slab_zone->zone_name, table, table->slab_zone);
522
523 memset(slab, 0, table->slab_sz);
524
525 /* put the new elements into a freelist */
526 wqdbg_v(" init %d new links...", table->slab_elem);
527 for (unsigned l = 0; l < table->slab_elem; l++) {
528 uint32_t idx = l + table->nelem;
529 if (idx >= (WQT_IDX_MAX - 1))
530 break; /* the last element of the last slab */
531 e = wqt_elem_ofst_slab(slab, table->slab_msk, l * table->elem_sz);
532 e->wqt_id.idx = idx;
533 e->wqt_next_idx = idx + 1;
534 }
535 last_new_elem = e;
536 assert(last_new_elem != NULL);
537
538 first_new_elem = wqt_elem_ofst_slab(slab, table->slab_msk, 0);
539
540 /* update table bookkeeping, and atomically swap the freelist head */
541 *slot = slab;
542 if (table->nelem + table->slab_elem >= WQT_IDX_MAX)
543 table->nelem = WQT_IDX_MAX - 1;
544 else
545 table->nelem += table->slab_elem;
546
547 #if CONFIG_WAITQ_STATS
548 table->nslabs += 1;
549 #endif
550
551 /*
552 * The atomic swap of the free list head marks the end of table
553 * growth. Incoming requests may now use the newly allocated slab
554 * of table elements
555 */
556 free_id = table->free_list;
557 /* connect the existing free list to the end of the new free list */
558 last_new_elem->wqt_next_idx = free_id.idx;
559 while (OSCompareAndSwap64(free_id.id, first_new_elem->wqt_id.id,
560 &table->free_list.id) == FALSE) {
561 OSMemoryBarrier();
562 free_id = table->free_list;
563 last_new_elem->wqt_next_idx = free_id.idx;
564 }
565 OSMemoryBarrier();
566
567 lck_mtx_unlock(&table->lock);
568
569 return;
570 }
571
572 static __attribute__((noinline))
573 struct wqt_elem *wq_table_alloc_elem(struct wq_table *table, int type, int nelem)
574 {
575 int nspins = 0, ntries = 0, nalloc = 0;
576 uint32_t table_size;
577 struct wqt_elem *elem = NULL;
578 struct wq_id free_id, next_id;
579
580 static const int max_retries = 500;
581
582 if (type != WQT_ELEM && type != WQT_LINK && type != WQT_RESERVED)
583 panic("wq_table_aloc of invalid elem type:%d from table @%p",
584 type, table);
585
586 assert(nelem > 0);
587
588 try_again:
589 elem = NULL;
590 if (ntries++ > max_retries) {
591 struct wqt_elem *tmp;
592 if (table->used_elem + nelem >= table_size)
593 panic("No more room to grow table: 0x%p size:%d, used:%d, requested elem:%d",
594 table, table_size, table->used_elem, nelem);
595 if (nelem == 1)
596 panic("Too many alloc retries: %d, table:%p, type:%d, nelem:%d",
597 ntries, table, type, nelem);
598 /* don't panic: try allocating one-at-a-time */
599 while (nelem > 0) {
600 tmp = wq_table_alloc_elem(table, type, 1);
601 if (elem)
602 wqt_elem_list_link(table, tmp, elem);
603 elem = tmp;
604 --nelem;
605 }
606 assert(elem != NULL);
607 return elem;
608 }
609
610 nalloc = 0;
611 table_size = table->nelem;
612
613 if (table->used_elem + nelem >= table_size) {
614 if (get_preemption_level() != 0) {
615 #if CONFIG_WAITQ_STATS
616 table->nspins += 1;
617 #endif
618 /*
619 * We may have just raced with table growth: check
620 * again to make sure there really isn't any space.
621 */
622 if (++nspins > 4)
623 panic("Can't grow table %p with preemption"
624 " disabled!", table);
625 delay(1);
626 goto try_again;
627 }
628 wq_table_grow(table, nelem);
629 goto try_again;
630 }
631
632 /* read this value only once before the CAS */
633 free_id = table->free_list;
634 if (free_id.idx >= table_size)
635 goto try_again;
636
637 /*
638 * Find the item on the free list which will become the new free list
639 * head, but be careful not to modify any memory (read only)! Other
640 * threads can alter table state at any time up until the CAS. We
641 * don't modify any memory until we've successfully swapped out the
642 * free list head with the one we've investigated.
643 */
644 for (struct wqt_elem *next_elem = wqt_elem_idx(table, free_id.idx);
645 nalloc < nelem;
646 nalloc++) {
647 elem = next_elem;
648 next_id.generation = 0;
649 next_id.idx = next_elem->wqt_next_idx;
650 if (next_id.idx < table->nelem) {
651 next_elem = wqt_elem_idx(table, next_id.idx);
652 next_id.id = next_elem->wqt_id.id;
653 } else {
654 goto try_again;
655 }
656 }
657 /* 'elem' points to the last element being allocated */
658
659 if (OSCompareAndSwap64(free_id.id, next_id.id,
660 &table->free_list.id) == FALSE)
661 goto try_again;
662
663 /* load barrier */
664 OSMemoryBarrier();
665
666 /*
667 * After the CAS, we know that we own free_id, and it points to a
668 * valid table entry (checked above). Grab the table pointer and
669 * reset some values.
670 */
671 OSAddAtomic(nelem, &table->used_elem);
672
673 /* end the list of allocated elements */
674 elem->wqt_next_idx = WQT_IDX_MAX;
675 /* reset 'elem' to point to the first allocated element */
676 elem = wqt_elem_idx(table, free_id.idx);
677
678 /*
679 * Update the generation count, and return the element(s)
680 * with a single reference (and no valid bit). If the
681 * caller immediately calls _put() on any element, then
682 * it will be released back to the free list. If the caller
683 * subsequently marks the element as valid, then the put
684 * will simply drop the reference.
685 */
686 for (struct wqt_elem *tmp = elem; ; ) {
687 assert(!wqt_bits_valid(tmp->wqt_bits) &&
688 (wqt_bits_refcnt(tmp->wqt_bits) == 0));
689 --nalloc;
690 tmp->wqt_id.generation += 1;
691 tmp->wqt_bits = 1;
692 wqt_elem_set_type(tmp, type);
693 if (tmp->wqt_next_idx == WQT_IDX_MAX)
694 break;
695 assert(tmp->wqt_next_idx != WQT_IDX_MAX);
696 tmp = wqt_elem_idx(table, tmp->wqt_next_idx);
697 }
698 assert(nalloc == 0);
699
700 #if CONFIG_WAITQ_STATS
701 uint64_t nreservations;
702 table->nallocs += nelem;
703 if (type == WQT_RESERVED)
704 OSIncrementAtomic64(&table->nreservations);
705 nreservations = table->nreservations;
706 if (table->used_elem > table->max_used)
707 table->max_used = table->used_elem;
708 if (nreservations > table->max_reservations)
709 table->max_reservations = nreservations;
710 table->avg_used = (table->avg_used + table->used_elem) / 2;
711 table->avg_reservations = (table->avg_reservations + nreservations) / 2;
712 #endif
713
714 return elem;
715 }
716
717 static void wq_table_realloc_elem(struct wq_table *table, struct wqt_elem *elem, int type)
718 {
719 (void)table;
720 assert(wqt_elem_in_range(elem, table) &&
721 !wqt_bits_valid(elem->wqt_bits));
722
723 #if CONFIG_WAITQ_STATS
724 table->nreallocs += 1;
725 if (wqt_bits_type(elem->wqt_bits) == WQT_RESERVED && type != WQT_RESERVED) {
726 /*
727 * This isn't done under any lock, so the count may drift;
728 * the stats are meant to be informative, not perfectly
729 * accurate.
730 */
731 OSDecrementAtomic64(&table->nreservations);
732 }
733 table->avg_reservations = (table->avg_reservations + table->nreservations) / 2;
734 #endif
735
736 /*
737 * Return the same element with a new generation count, and a
738 * (potentially) new type. Don't touch the refcount: the caller
739 * is responsible for getting that (and the valid bit) correct.
740 */
741 elem->wqt_id.generation += 1;
742 elem->wqt_next_idx = WQT_IDX_MAX;
743 wqt_elem_set_type(elem, type);
744
745 return;
746 }
747
748 static void wq_table_free_elem(struct wq_table *table, struct wqt_elem *elem)
749 {
750 struct wq_id next_id;
751
752 assert(wqt_elem_in_range(elem, table) &&
753 !wqt_bits_valid(elem->wqt_bits) &&
754 (wqt_bits_refcnt(elem->wqt_bits) == 0));
755
756 OSDecrementAtomic(&table->used_elem);
757
758 #if CONFIG_WAITQ_STATS
759 table->avg_used = (table->avg_used + table->used_elem) / 2;
760 if (wqt_bits_type(elem->wqt_bits) == WQT_RESERVED)
761 OSDecrementAtomic64(&table->nreservations);
762 table->avg_reservations = (table->avg_reservations + table->nreservations) / 2;
763 #endif
764
765 elem->wqt_bits = 0;
766
767 if (table->poison)
768 (table->poison)(table, elem);
769
770 again:
771 next_id = table->free_list;
772 if (next_id.idx >= table->nelem)
773 elem->wqt_next_idx = WQT_IDX_MAX;
774 else
775 elem->wqt_next_idx = next_id.idx;
776
777 /* store barrier */
778 OSMemoryBarrier();
779 if (OSCompareAndSwap64(next_id.id, elem->wqt_id.id,
780 &table->free_list.id) == FALSE)
781 goto again;
782 }
783
784 /* get a reference to a table element identified by 'id' */
785 static struct wqt_elem *wq_table_get_elem(struct wq_table *table, uint64_t id)
786 {
787 struct wqt_elem *elem;
788 uint32_t idx, bits, new_bits;
789
790 /*
791 * Here we have a reference to the table which is guaranteed to remain
792 * valid until we drop the reference
793 */
794
795 idx = ((struct wq_id *)&id)->idx;
796
797 if (idx >= table->nelem)
798 panic("id:0x%llx : idx:%d > %d", id, idx, table->nelem);
799
800 elem = wqt_elem_idx(table, idx);
801
802 /* verify the validity by taking a reference on the table object */
803 bits = elem->wqt_bits;
804 if (!wqt_bits_valid(bits))
805 return NULL;
806
807 /*
808 * do a pre-verify on the element ID to potentially
809 * avoid 2 compare-and-swaps
810 */
811 if (elem->wqt_id.id != id)
812 return NULL;
813
814 new_bits = bits + 1;
815
816 /* check for overflow */
817 assert(wqt_bits_refcnt(new_bits) > 0);
818
819 while (OSCompareAndSwap(bits, new_bits, &elem->wqt_bits) == FALSE) {
820 /*
821 * either the element became invalid,
822 * or someone else grabbed/removed a reference.
823 */
824 bits = elem->wqt_bits;
825 if (!wqt_bits_valid(bits)) {
826 /* don't return invalid elements */
827 return NULL;
828 }
829 new_bits = bits + 1;
830 assert(wqt_bits_refcnt(new_bits) > 0);
831 }
832
833 /* load barrier */
834 OSMemoryBarrier();
835
836 /* check to see that our reference is to the same generation! */
837 if (elem->wqt_id.id != id) {
838 /*
839 wqdbg("ID:0x%llx table generation (%d) != %d",
840 id, elem->wqt_id.generation,
841 ((struct wq_id *)&id)->generation);
842 */
843 wq_table_put_elem(table, elem);
844 return NULL;
845 }
846
847 /* We now have a reference on a valid object */
848 return elem;
849 }
850
851 /* release a ref to table element - puts it back on free list as appropriate */
852 static void wq_table_put_elem(struct wq_table *table, struct wqt_elem *elem)
853 {
854 uint32_t bits, new_bits;
855
856 assert(wqt_elem_in_range(elem, table));
857
858 bits = elem->wqt_bits;
859 new_bits = bits - 1;
860
861 /* check for underflow */
862 assert(wqt_bits_refcnt(new_bits) < WQT_BITS_REFCNT_MASK);
863
864 while (OSCompareAndSwap(bits, new_bits, &elem->wqt_bits) == FALSE) {
865 bits = elem->wqt_bits;
866 new_bits = bits - 1;
867 /* catch underflow */
868 assert(wqt_bits_refcnt(new_bits) < WQT_BITS_REFCNT_MASK);
869 }
870
871 /* load barrier */
872 OSMemoryBarrier();
873
874 /*
875 * if this was the last reference, and it was marked as invalid,
876 * then we can add this link object back to the free list
877 */
878 if (!wqt_bits_valid(new_bits) && (wqt_bits_refcnt(new_bits) == 0))
879 wq_table_free_elem(table, elem);
880
881 return;
882 }
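/*
 * Illustrative sketch (not part of the original source): the typical
 * lifecycle of a table element using the helpers above. The function name
 * is hypothetical; the real users are the link and prepost tables below.
 */
#if 0 /* example only */
static void wq_table_lifecycle_example(struct wq_table *table)
{
	/* allocate one element: returned with refcnt == 1, not yet valid */
	struct wqt_elem *elem = wq_table_alloc_elem(table, WQT_ELEM, 1);
	uint64_t id = elem->wqt_id.id;

	/* publish it: lookups by id may now succeed */
	wqt_elem_mkvalid(elem);

	/* a lookup takes an extra reference... */
	struct wqt_elem *found = wq_table_get_elem(table, id);
	assert(found == elem);

	/* ...which must be dropped when the caller is done */
	wq_table_put_elem(table, found);

	/* tear-down: invalidate, then drop the allocation reference; the
	 * final put returns the element to the free list */
	wqt_elem_invalidate(elem);
	wq_table_put_elem(table, elem);
}
#endif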
883
884
885 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
886 *
887 * API: wqt_elem_list_...
888 *
889 * Reuse the free list linkage member, 'wqt_next_idx' of a table element
890 * in a slightly more generic singly-linked list. All members of this
891 * list have been allocated from a table, but have not been made valid.
892 *
893 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
894
895 /* link parent->child */
896 static int wqt_elem_list_link(struct wq_table *table, struct wqt_elem *parent, struct wqt_elem *child)
897 {
898 int nelem = 1;
899
900 assert(wqt_elem_in_range(parent, table));
901
902 /* find the end of the parent's list */
903 while (parent->wqt_next_idx != WQT_IDX_MAX) {
904 assert(parent->wqt_next_idx < table->nelem);
905 parent = wqt_elem_idx(table, parent->wqt_next_idx);
906 nelem++;
907 }
908
909 if (child) {
910 assert(wqt_elem_in_range(child, table));
911 parent->wqt_next_idx = child->wqt_id.idx;
912 }
913
914 return nelem;
915 }
916
917 static struct wqt_elem *wqt_elem_list_next(struct wq_table *table, struct wqt_elem *head)
918 {
919 struct wqt_elem *elem;
920
921 if (!head)
922 return NULL;
923 if (head->wqt_next_idx >= table->nelem)
924 return NULL;
925
926 elem = wqt_elem_idx(table, head->wqt_next_idx);
927 assert(wqt_elem_in_range(elem, table));
928
929 return elem;
930 }
931
932 /*
933 * Obtain a pointer to the first element of a list. Don't take an extra
934 * reference on the object - the list implicitly holds that reference.
935 *
936 * This function is used to convert the head of a singly-linked list
937 * to a real wqt_elem object.
938 */
939 static struct wqt_elem *wqt_elem_list_first(struct wq_table *table, uint64_t id)
940 {
941 uint32_t idx;
942 struct wqt_elem *elem = NULL;
943
944 if (id == 0)
945 return NULL;
946
947 idx = ((struct wq_id *)&id)->idx;
948
949 if (idx >= table->nelem)
950 panic("Invalid element for id:0x%llx", id);
951 elem = wqt_elem_idx(table, idx);
952
953 /* invalid element: reserved ID was probably already reallocated */
954 if (elem->wqt_id.id != id)
955 return NULL;
956
957 /* the returned element should _not_ be marked valid! */
958 if (wqt_bits_valid(elem->wqt_bits) ||
959 wqt_bits_type(elem->wqt_bits) != WQT_RESERVED ||
960 wqt_bits_refcnt(elem->wqt_bits) != 1) {
961 panic("Valid/unreserved element %p (0x%x) in reserved list",
962 elem, elem->wqt_bits);
963 }
964
965 return elem;
966 }
967
968 static void wqt_elem_reset_next(struct wq_table *table, struct wqt_elem *wqp)
969 {
970 (void)table;
971
972 if (!wqp)
973 return;
974 assert(wqt_elem_in_range(wqp, table));
975
976 wqp->wqt_next_idx = WQT_IDX_MAX;
977 }
978
979 /*
980 * Pop an item off the list.
981 * New list head returned in *id, caller responsible for reference on returned
982 * object. We do a realloc here to reset the type of the object, but still
983 * leave it invalid.
984 */
985 static struct wqt_elem *wqt_elem_list_pop(struct wq_table *table, uint64_t *id, int type)
986 {
987 struct wqt_elem *first, *next;
988
989 if (!id || *id == 0)
990 return NULL;
991
992 /* pop an item off the reserved stack */
993
994 first = wqt_elem_list_first(table, *id);
995 if (!first) {
996 *id = 0;
997 return NULL;
998 }
999
1000 next = wqt_elem_list_next(table, first);
1001 if (next)
1002 *id = next->wqt_id.id;
1003 else
1004 *id = 0;
1005
1006 wq_table_realloc_elem(table, first, type);
1007
1008 return first;
1009 }
1010
1011 /*
1012 * Free an entire list of linked/reserved elements
1013 */
1014 static int wqt_elem_list_release(struct wq_table *table,
1015 struct wqt_elem *head,
1016 int __assert_only type)
1017 {
1018 struct wqt_elem *elem;
1019 struct wq_id free_id;
1020 int nelem = 0;
1021
1022 if (!head)
1023 return 0;
1024
1025 for (elem = head; ; ) {
1026 assert(wqt_elem_in_range(elem, table));
1027 assert(!wqt_bits_valid(elem->wqt_bits) && (wqt_bits_refcnt(elem->wqt_bits) == 1));
1028 assert(wqt_bits_type(elem->wqt_bits) == type);
1029
1030 nelem++;
1031 elem->wqt_bits = 0;
1032 if (table->poison)
1033 (table->poison)(table, elem);
1034
1035 if (elem->wqt_next_idx == WQT_IDX_MAX)
1036 break;
1037 assert(elem->wqt_next_idx < table->nelem);
1038 elem = wqt_elem_idx(table, elem->wqt_next_idx);
1039 }
1040
1041 /*
1042 * 'elem' now points to the end of our list, and 'head' points to the
1043 * beginning. We want to atomically swap the free list pointer with
1044 * the 'head' and ensure that 'elem' points to the previous free list
1045 * head.
1046 */
1047
1048 again:
1049 free_id = table->free_list;
1050 if (free_id.idx >= table->nelem)
1051 elem->wqt_next_idx = WQT_IDX_MAX;
1052 else
1053 elem->wqt_next_idx = free_id.idx;
1054
1055 /* store barrier */
1056 OSMemoryBarrier();
1057 if (OSCompareAndSwap64(free_id.id, head->wqt_id.id,
1058 &table->free_list.id) == FALSE)
1059 goto again;
1060
1061 OSAddAtomic(-nelem, &table->used_elem);
1062 return nelem;
1063 }
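/*
 * Illustrative sketch (not part of the original source): how the
 * wqt_elem_list_* helpers above are used with WQT_RESERVED elements. The
 * flow mirrors what the prepost reservation code does later in this file;
 * the function name is hypothetical.
 */
#if 0 /* example only */
static void wqt_elem_list_example(struct wq_table *table)
{
	/* reserve a linked list of 3 elements (each refcnt == 1, not valid) */
	struct wqt_elem *head = wq_table_alloc_elem(table, WQT_RESERVED, 3);
	uint64_t list_id = head->wqt_id.id;

	/* pop the head off the list and re-type it for real use; 'list_id'
	 * now identifies the two remaining reserved elements */
	struct wqt_elem *elem = wqt_elem_list_pop(table, &list_id, WQT_LINK);
	assert(elem == head);

	/* ... use 'elem' (set it up, mark it valid, etc.) ... */

	/* hand the unused reservations back to the table's free list */
	wqt_elem_list_release(table, wqt_elem_list_first(table, list_id),
			      WQT_RESERVED);
}
#endif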
1064
1065
1066 /* ----------------------------------------------------------------------
1067 *
1068 * SetID Link Table Implementation
1069 *
1070 * ---------------------------------------------------------------------- */
1071 static struct wq_table g_linktable;
1072
1073 enum setid_link_type {
1074 SLT_ALL = -1,
1075 SLT_FREE = WQT_FREE,
1076 SLT_WQS = WQT_ELEM,
1077 SLT_LINK = WQT_LINK,
1078 };
1079
1080 struct setid_link {
1081 struct wqt_elem wqte;
1082
1083 union {
1084 /* wqt_type == SLT_WQS (WQT_ELEM) */
1085 struct {
1086 struct waitq_set *sl_set;
1087 /* uint64_t sl_prepost_id; */
1088 } sl_wqs;
1089
1090 /* wqt_type == SLT_LINK (WQT_LINK) */
1091 struct {
1092 uint64_t sl_left_setid;
1093 uint64_t sl_right_setid;
1094 } sl_link;
1095 };
1096 #ifdef CONFIG_WAITQ_LINK_STATS
1097 thread_t sl_alloc_th;
1098 task_t sl_alloc_task;
1099 uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];
1100 uint64_t sl_alloc_ts;
1101 uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];
1102 uint64_t sl_invalidate_ts;
1103 uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];
1104 uint64_t sl_mkvalid_ts;
1105 uint64_t sl_free_ts;
1106 #endif
1107 };
1108 #if !defined(CONFIG_WAITQ_LINK_STATS)
1109 _Static_assert((sizeof(struct setid_link) & (sizeof(struct setid_link) - 1)) == 0,
1110 "setid_link struct must be a power of two!");
1111 #endif
1112
1113 #define sl_refcnt(link) \
1114 (wqt_bits_refcnt((link)->wqte.wqt_bits))
1115
1116 #define sl_type(link) \
1117 (wqt_bits_type((link)->wqte.wqt_bits))
1118
1119 #define sl_set_valid(link) \
1120 do { \
1121 wqt_elem_mkvalid(&(link)->wqte); \
1122 lt_do_mkvalid_stats(&(link)->wqte); \
1123 } while (0)
1124
1125 #define sl_is_valid(link) \
1126 wqt_bits_valid((link)->wqte.wqt_bits)
1127
1128 #define sl_set_id wqte.wqt_id
1129
1130 #define SLT_WQS_POISON ((void *)(0xf00df00d))
1131 #define SLT_LINK_POISON (0x0bad0badffffffffull)
1132
1133 static void lt_poison(struct wq_table *table, struct wqt_elem *elem)
1134 {
1135 struct setid_link *sl_link = (struct setid_link *)elem;
1136 (void)table;
1137
1138 switch (sl_type(sl_link)) {
1139 case SLT_WQS:
1140 sl_link->sl_wqs.sl_set = SLT_WQS_POISON;
1141 break;
1142 case SLT_LINK:
1143 sl_link->sl_link.sl_left_setid = SLT_LINK_POISON;
1144 sl_link->sl_link.sl_right_setid = SLT_LINK_POISON;
1145 break;
1146 default:
1147 break;
1148 }
1149 #ifdef CONFIG_WAITQ_LINK_STATS
1150 memset(sl_link->sl_alloc_bt, 0, sizeof(sl_link->sl_alloc_bt));
1151 sl_link->sl_alloc_ts = 0;
1152 memset(sl_link->sl_mkvalid_bt, 0, sizeof(sl_link->sl_mkvalid_bt));
1153 sl_link->sl_mkvalid_ts = 0;
1154
1155 sl_link->sl_alloc_th = THREAD_NULL;
1156 /* leave the sl_alloc_task in place for debugging */
1157
1158 sl_link->sl_free_ts = mach_absolute_time();
1159 #endif
1160 }
1161
1162 #ifdef CONFIG_WAITQ_LINK_STATS
1163 static __inline__ void lt_do_alloc_stats(struct wqt_elem *elem)
1164 {
1165 if (elem) {
1166 struct setid_link *link = (struct setid_link *)elem;
1167 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
1168 waitq_grab_backtrace(link->sl_alloc_bt, 0);
1169 link->sl_alloc_th = current_thread();
1170 link->sl_alloc_task = current_task();
1171
1172 assert(link->sl_alloc_ts == 0);
1173 link->sl_alloc_ts = mach_absolute_time();
1174
1175 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
1176 link->sl_invalidate_ts = 0;
1177 }
1178 }
1179
1180 static __inline__ void lt_do_invalidate_stats(struct wqt_elem *elem)
1181 {
1182 struct setid_link *link = (struct setid_link *)elem;
1183
1184 if (!elem)
1185 return;
1186
1187 assert(link->sl_mkvalid_ts > 0);
1188
1189 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
1190 link->sl_invalidate_ts = mach_absolute_time();
1191 waitq_grab_backtrace(link->sl_invalidate_bt, 0);
1192 }
1193
1194 static __inline__ void lt_do_mkvalid_stats(struct wqt_elem *elem)
1195 {
1196 struct setid_link *link = (struct setid_link *)elem;
1197
1198 if (!elem)
1199 return;
1200
1201 memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
1202 link->sl_mkvalid_ts = mach_absolute_time();
1203 waitq_grab_backtrace(link->sl_mkvalid_bt, 0);
1204 }
1205 #else
1206 #define lt_do_alloc_stats(e)
1207 #define lt_do_invalidate_stats(e)
1208 #define lt_do_mkvalid_stats(e)
1209 #endif /* CONFIG_WAITQ_LINK_STATS */
1210
1211 static void lt_init(void)
1212 {
1213 uint32_t tablesz = 0, max_links = 0;
1214
1215 if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE)
1216 tablesz = (uint32_t)g_wqt_max_tbl_size;
1217
1218 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
1219 max_links = tablesz / sizeof(struct setid_link);
1220 assert(max_links > 0 && tablesz > 0);
1221
1222 /* we have a restricted index range */
1223 if (max_links > (WQT_IDX_MAX + 1))
1224 max_links = WQT_IDX_MAX + 1;
1225
1226 wqinfo("init linktable with max:%d elements (%d bytes)",
1227 max_links, tablesz);
1228 wq_table_init(&g_linktable, "wqslab.links", max_links,
1229 sizeof(struct setid_link), lt_poison);
1230 }
1231
1232 static void lt_ensure_free_space(void)
1233 {
1234 if (g_linktable.nelem - g_linktable.used_elem < g_min_free_table_elem) {
1235 /*
1236 * we don't hold locks on these values, so check for underflow
1237 */
1238 if (g_linktable.used_elem <= g_linktable.nelem) {
1239 wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",
1240 g_linktable.nelem, g_linktable.used_elem,
1241 g_min_free_table_elem);
1242 wq_table_grow(&g_linktable, g_min_free_table_elem);
1243 }
1244 }
1245 }
1246
1247 static struct setid_link *lt_alloc_link(int type)
1248 {
1249 struct wqt_elem *elem;
1250
1251 elem = wq_table_alloc_elem(&g_linktable, type, 1);
1252 lt_do_alloc_stats(elem);
1253 return (struct setid_link *)elem;
1254 }
1255
1256 static void lt_realloc_link(struct setid_link *link, int type)
1257 {
1258 wq_table_realloc_elem(&g_linktable, &link->wqte, type);
1259 #ifdef CONFIG_WAITQ_LINK_STATS
1260 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
1261 link->sl_alloc_ts = 0;
1262 lt_do_alloc_stats(&link->wqte);
1263
1264 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
1265 link->sl_invalidate_ts = 0;
1266 #endif
1267 }
1268
1269 static void lt_invalidate(struct setid_link *link)
1270 {
1271 wqt_elem_invalidate(&link->wqte);
1272 lt_do_invalidate_stats(&link->wqte);
1273 }
1274
1275 static struct setid_link *lt_get_link(uint64_t setid)
1276 {
1277 struct wqt_elem *elem;
1278
1279 elem = wq_table_get_elem(&g_linktable, setid);
1280 return (struct setid_link *)elem;
1281 }
1282
1283 static void lt_put_link(struct setid_link *link)
1284 {
1285 if (!link)
1286 return;
1287 wq_table_put_elem(&g_linktable, (struct wqt_elem *)link);
1288 }
1289
1290 static struct setid_link *lt_get_reserved(uint64_t setid, int type)
1291 {
1292 struct wqt_elem *elem;
1293
1294 elem = wqt_elem_list_first(&g_linktable, setid);
1295 if (!elem)
1296 return NULL;
1297 wq_table_realloc_elem(&g_linktable, elem, type);
1298 return (struct setid_link *)elem;
1299 }
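/*
 * Illustrative sketch (not part of the original source): allocating and
 * publishing an SLT_WQS link with the wrappers above. 'my_wqset' stands in
 * for an already-initialized waitq set; the function name is hypothetical.
 */
#if 0 /* example only */
static uint64_t lt_publish_wqs_example(struct waitq_set *my_wqset)
{
	struct setid_link *link = lt_alloc_link(SLT_WQS);

	link->sl_wqs.sl_set = my_wqset;
	sl_set_valid(link);		/* now discoverable via its set id */

	uint64_t setid = link->sl_set_id.id;
	lt_put_link(link);		/* drop the allocation reference */
	return setid;
}
#endif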
1300
1301
1302 static inline int waitq_maybe_remove_link(struct waitq *waitq,
1303 uint64_t setid,
1304 struct setid_link *parent,
1305 struct setid_link *left,
1306 struct setid_link *right);
1307
1308 enum {
1309 LINK_WALK_ONE_LEVEL = 0,
1310 LINK_WALK_FULL_DAG = 1,
1311 LINK_WALK_FULL_DAG_UNLOCKED = 2,
1312 };
1313
1314 typedef int (*lt_callback_func)(struct waitq *waitq, void *ctx,
1315 struct setid_link *link);
1316
1317 /**
1318 * walk all table elements (of type 'link_type') pointed to by 'setid'
1319 *
1320 * Conditions:
1321 * waitq is locked (or NULL)
1322 * 'setid' is managed by 'waitq'
1323 * this could be direct (waitq->waitq_set_id == setid)
1324 * OR indirect (setid is the left/right ID in a LINK chain,
1325 * whose root is waitq->waitq_set_id)
1326 *
1327 * Notes:
1328 * This function uses recursion to walk the set of table elements
1329 * pointed to by 'setid'. For each element encountered, 'cb' will be
1330 * called. If non-zero, the return value of this callback function can
1331 * early-out of the table walk.
1332 *
1333 * For each link element encountered, the function takes a reference to
1334 * it. The reference is dropped only after the callback and any recursion
1335 * has completed.
1336 *
1337 * The assumed table/link/tree structure:
1338 * 'setid'
1339 * / \
1340 * / \
1341 * L(LINK) R(LINK)
1342 * /\ /\
1343 * / \ / \
1344 * / \ Rl(*) Rr(*)
1345 * Ll(*) Lr(*) /\ /\
1346 * /\ /\ ... ... ... ...
1347 * ... ... ... ...
1348 * \
1349 * WQS(wqset_q.waitq_set_id == Sx)
1350 * [waitq set is a member of setid 'Sx']
1351 *
1352 * 'Sx'
1353 * / \
1354 * / \
1355 * L(LINK) R(LINK)
1356 * /\ /\
1357 * ... ... ... ...
1358 *
1359 * The basic algorithm is as follows:
1360 * *) take a reference to the table object pointed to by 'setid'
1361 * *) if appropriate, call 'cb' (potentially early-out on non-zero return)
1362 * *) if the link object points to a waitq set, and the walk type
1363 * is 'FULL_DAG' (full directed-acyclic-graph), then try to lock
1364 * the associated waitq set object and recursively walk all sets to
1365 * which that set belongs. This is a DFS of the tree structure.
1366 * *) recurse down the left side of the tree (following the
1367 * 'sl_left_setid' pointer in the link object
1368 * *) recurse down the right side of the tree (following the
1369 * 'sl_right_setid' pointer in the link object
1370 */
1371 static __attribute__((noinline))
1372 int walk_setid_links(int walk_type, struct waitq *waitq,
1373 uint64_t setid, int link_type,
1374 void *ctx, lt_callback_func cb)
1375 {
1376 struct setid_link *link;
1377 uint64_t nextid;
1378 int sl_type;
1379
1380 link = lt_get_link(setid);
1381
1382 /* invalid link */
1383 if (!link)
1384 return WQ_ITERATE_CONTINUE;
1385
1386 setid = nextid = 0;
1387 sl_type = sl_type(link);
1388 if (sl_type == SLT_LINK) {
1389 setid = link->sl_link.sl_left_setid;
1390 nextid = link->sl_link.sl_right_setid;
1391 }
1392
1393 /*
1394 * Make the callback only on specified link_type (or all links)
1395 * Note that after the callback, the link object may be
1396 * invalid. The only valid thing we can do is put our
1397 * reference to it (which may put it back on the free list)
1398 */
1399 if (link_type == SLT_ALL || link_type == sl_type) {
1400 /* allow the callback to early-out */
1401 int ret = cb(waitq, ctx, link);
1402 if (ret != WQ_ITERATE_CONTINUE) {
1403 lt_put_link(link);
1404 return ret;
1405 }
1406 }
1407
1408 if (sl_type == SLT_WQS &&
1409 (walk_type == LINK_WALK_FULL_DAG ||
1410 walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {
1411 /*
1412 * Recurse down any sets to which this wait queue set was
1413 * added. We do this just before we put our reference to
1414 * the link object (which may free it).
1415 */
1416 struct waitq_set *wqset = link->sl_wqs.sl_set;
1417 int ret = WQ_ITERATE_CONTINUE;
1418 int get_spl = 0;
1419 int should_unlock = 0;
1420 uint64_t wqset_setid = 0;
1421 spl_t set_spl;
1422
1423 if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {
1424 if ((!waitq || !waitq_irq_safe(waitq)) &&
1425 waitq_irq_safe(&wqset->wqset_q)) {
1426 get_spl = 1;
1427 set_spl = splsched();
1428 }
1429 waitq_set_lock(wqset);
1430 should_unlock = 1;
1431 }
1432
1433 /*
1434 * verify the linked waitq set as it could have been
1435 * invalidated before we grabbed the lock!
1436 */
1437 if (wqset->wqset_id != link->sl_set_id.id) {
1438 /* This is the bottom of the tree: just get out */
1439 if (should_unlock) {
1440 waitq_set_unlock(wqset);
1441 if (get_spl)
1442 splx(set_spl);
1443 }
1444 lt_put_link(link);
1445 return WQ_ITERATE_CONTINUE;
1446 }
1447
1448 wqset_setid = wqset->wqset_q.waitq_set_id;
1449
1450 if (wqset_setid > 0)
1451 ret = walk_setid_links(walk_type, &wqset->wqset_q,
1452 wqset_setid, link_type, ctx, cb);
1453 if (should_unlock) {
1454 waitq_set_unlock(wqset);
1455 if (get_spl)
1456 splx(set_spl);
1457 }
1458 if (ret != WQ_ITERATE_CONTINUE) {
1459 lt_put_link(link);
1460 return ret;
1461 }
1462 }
1463
1464 lt_put_link(link);
1465
1466 /* recurse down left side of the tree */
1467 if (setid) {
1468 int ret = walk_setid_links(walk_type, waitq, setid, link_type, ctx, cb);
1469 if (ret != WQ_ITERATE_CONTINUE)
1470 return ret;
1471 }
1472
1473 /* recurse down right side of the tree */
1474 if (nextid)
1475 return walk_setid_links(walk_type, waitq, nextid, link_type, ctx, cb);
1476
1477 return WQ_ITERATE_CONTINUE;
1478 }
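/*
 * Illustrative sketch (not part of the original source): a
 * walk_setid_links() callback that counts the waitq sets directly linked
 * to a waitq. The names are hypothetical; real callers appear later in
 * this file.
 */
#if 0 /* example only */
static int count_wqs_cb(struct waitq *waitq, void *ctx,
			struct setid_link *link)
{
	(void)waitq;
	if (sl_type(link) == SLT_WQS)
		(*(int *)ctx)++;
	return WQ_ITERATE_CONTINUE;	/* keep walking */
}

static int count_member_sets(struct waitq *waitq)
{
	int count = 0;

	/* caller holds the waitq lock; visit only WQS-type links */
	(void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq,
			       waitq->waitq_set_id, SLT_WQS,
			       &count, count_wqs_cb);
	return count;
}
#endif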
1479
1480 /* ----------------------------------------------------------------------
1481 *
1482 * Prepost Link Table Implementation
1483 *
1484 * ---------------------------------------------------------------------- */
1485 static struct wq_table g_prepost_table;
1486
1487 enum wq_prepost_type {
1488 WQP_FREE = WQT_FREE,
1489 WQP_WQ = WQT_ELEM,
1490 WQP_POST = WQT_LINK,
1491 };
1492
1493 struct wq_prepost {
1494 struct wqt_elem wqte;
1495
1496 union {
1497 /* wqt_type == WQP_WQ (WQT_ELEM) */
1498 struct {
1499 struct waitq *wqp_wq_ptr;
1500 } wqp_wq;
1501 /* wqt_type == WQP_POST (WQT_LINK) */
1502 struct {
1503 uint64_t wqp_next_id;
1504 uint64_t wqp_wq_id;
1505 } wqp_post;
1506 };
1507 #ifdef CONFIG_WAITQ_PREPOST_STATS
1508 thread_t wqp_alloc_th;
1509 task_t wqp_alloc_task;
1510 uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];
1511 #endif
1512 };
1513 #if !defined(CONFIG_WAITQ_PREPOST_STATS)
1514 _Static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,
1515 "wq_prepost struct must be a power of two!");
1516 #endif
1517
1518 #define wqp_refcnt(wqp) \
1519 (wqt_bits_refcnt((wqp)->wqte.wqt_bits))
1520
1521 #define wqp_type(wqp) \
1522 (wqt_bits_type((wqp)->wqte.wqt_bits))
1523
1524 #define wqp_set_valid(wqp) \
1525 wqt_elem_mkvalid(&(wqp)->wqte)
1526
1527 #define wqp_is_valid(wqp) \
1528 wqt_bits_valid((wqp)->wqte.wqt_bits)
1529
1530 #define wqp_prepostid wqte.wqt_id
1531
1532 #define WQP_WQ_POISON (0x0bad0badffffffffull)
1533 #define WQP_POST_POISON (0xf00df00df00df00d)
1534
1535 static void wqp_poison(struct wq_table *table, struct wqt_elem *elem)
1536 {
1537 struct wq_prepost *wqp = (struct wq_prepost *)elem;
1538 (void)table;
1539
1540 switch (wqp_type(wqp)) {
1541 case WQP_WQ:
1542 break;
1543 case WQP_POST:
1544 wqp->wqp_post.wqp_next_id = WQP_POST_POISON;
1545 wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;
1546 break;
1547 default:
1548 break;
1549 }
1550 }
1551
1552 #ifdef CONFIG_WAITQ_PREPOST_STATS
1553 static __inline__ void wqp_do_alloc_stats(struct wqt_elem *elem)
1554 {
1555 if (elem) {
1556 struct wq_prepost *wqp = (struct wq_prepost *)elem;
1557
1558 /* be sure to take stats for _all_ allocated objects */
1559 for (;;) {
1560 uint32_t next_idx;
1561
1562 memset(wqp->wqp_alloc_bt, 0, sizeof(wqp->wqp_alloc_bt));
1563 waitq_grab_backtrace(wqp->wqp_alloc_bt, 4);
1564 wqp->wqp_alloc_th = current_thread();
1565 wqp->wqp_alloc_task = current_task();
1566 next_idx = wqp->wqte.wqt_next_idx;
1567
1568 if (next_idx == WQT_IDX_MAX)
1569 break;
1570 assert(next_idx < g_prepost_table.nelem);
1571
1572 wqp = (struct wq_prepost *)wqt_elem_idx(&g_prepost_table,
1573 next_idx);
1574 }
1575 }
1576 }
1577 #else
1578 #define wqp_do_alloc_stats(e)
1579 #endif /* CONFIG_WAITQ_PREPOST_STATS */
1580
1581 static void wqp_init(void)
1582 {
1583 uint32_t tablesz = 0, max_wqp = 0;
1584
1585 if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE)
1586 tablesz = (uint32_t)g_wqt_max_tbl_size;
1587
1588 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
1589 max_wqp = tablesz / sizeof(struct wq_prepost);
1590 assert(max_wqp > 0 && tablesz > 0);
1591
1592 /* we have a restricted index range */
1593 if (max_wqp > (WQT_IDX_MAX + 1))
1594 max_wqp = WQT_IDX_MAX + 1;
1595
1596 wqinfo("init prepost table with max:%d elements (%d bytes)",
1597 max_wqp, tablesz);
1598 wq_table_init(&g_prepost_table, "wqslab.prepost", max_wqp,
1599 sizeof(struct wq_prepost), wqp_poison);
1600 }
1601
1602 /*
1603 * Refill the per-CPU cache.
1604 */
1605 static void wq_prepost_refill_cpu_cache(uint32_t nalloc)
1606 {
1607 struct wqt_elem *new_head, *old_head;
1608 struct wqp_cache *cache;
1609
1610 /* require preemption enabled to allocate elements */
1611 if (get_preemption_level() != 0)
1612 return;
1613
1614 new_head = wq_table_alloc_elem(&g_prepost_table,
1615 WQT_RESERVED, nalloc);
1616 if (new_head == NULL)
1617 return;
1618
1619 disable_preemption();
1620 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
1621 cache->avail += nalloc;
1622 if (cache->head == 0 || cache->head == WQT_IDX_MAX) {
1623 cache->head = new_head->wqt_id.id;
1624 goto out;
1625 }
1626
1627 old_head = wqt_elem_list_first(&g_prepost_table, cache->head);
1628 (void)wqt_elem_list_link(&g_prepost_table, new_head, old_head);
1629 cache->head = new_head->wqt_id.id;
1630
1631 out:
1632 enable_preemption();
1633 return;
1634 }
1635
1636 static void wq_prepost_ensure_free_space(void)
1637 {
1638 uint32_t free_elem;
1639 uint32_t min_free;
1640 struct wqp_cache *cache;
1641
1642 if (g_min_free_cache == 0)
1643 g_min_free_cache = (WQP_CACHE_MAX * ml_get_max_cpus());
1644
1645 /*
1646 * Ensure that we always have a pool of per-CPU prepost elements
1647 */
1648 disable_preemption();
1649 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
1650 free_elem = cache->avail;
1651 enable_preemption();
1652
1653 if (free_elem < (WQP_CACHE_MAX / 3))
1654 wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);
1655
1656 /*
1657 * Now ensure that we have a sufficient amount of free table space
1658 */
1659 free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;
1660 min_free = g_min_free_table_elem + g_min_free_cache;
1661 if (free_elem < min_free) {
1662 /*
1663 * we don't hold locks on these values, so check for underflow
1664 */
1665 if (g_prepost_table.used_elem <= g_prepost_table.nelem) {
1666 wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",
1667 g_prepost_table.nelem, g_prepost_table.used_elem,
1668 g_min_free_table_elem, g_min_free_cache);
1669 wq_table_grow(&g_prepost_table, min_free);
1670 }
1671 }
1672 }
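/*
 * Worked example (illustrative note, not from the original source) of the
 * thresholds above, assuming a hypothetical WQP_CACHE_MAX of 64 on a
 * 4-CPU machine: g_min_free_cache == 256, so with the default
 * g_min_free_table_elem of 100 the prepost table is grown whenever fewer
 * than 356 elements remain free. The per-CPU cache itself is refilled
 * once it drops below WQP_CACHE_MAX/3 entries.
 */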
1673
1674 static struct wq_prepost *wq_prepost_alloc(int type, int nelem)
1675 {
1676 struct wqt_elem *elem;
1677 struct wq_prepost *wqp;
1678 struct wqp_cache *cache;
1679
1680 if (type != WQT_RESERVED)
1681 goto do_alloc;
1682 if (nelem == 0)
1683 return NULL;
1684
1685 /*
1686 * First try to grab the elements from the per-CPU cache if we are
1687 * allocating RESERVED elements
1688 */
1689 disable_preemption();
1690 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
1691 if (nelem <= (int)cache->avail) {
1692 struct wqt_elem *first, *next = NULL;
1693 int nalloc = nelem;
1694
1695 cache->avail -= nelem;
1696
1697 /* grab the first element */
1698 first = wqt_elem_list_first(&g_prepost_table, cache->head);
1699
1700 /* find the last element and re-adjust the cache head */
1701 for (elem = first; elem != NULL && nalloc > 0; elem = next) {
1702 next = wqt_elem_list_next(&g_prepost_table, elem);
1703 if (--nalloc == 0) {
1704 /* terminate the allocated list */
1705 elem->wqt_next_idx = WQT_IDX_MAX;
1706 break;
1707 }
1708 }
1709 assert(nalloc == 0);
1710 if (!next)
1711 cache->head = WQT_IDX_MAX;
1712 else
1713 cache->head = next->wqt_id.id;
1714 /* assert that we don't have mismatched bookkeeping */
1715 assert(!(cache->head == WQT_IDX_MAX && cache->avail > 0));
1716 enable_preemption();
1717 elem = first;
1718 goto out;
1719 }
1720 enable_preemption();
1721
1722 do_alloc:
1723 /* fall-back to standard table allocation */
1724 elem = wq_table_alloc_elem(&g_prepost_table, type, nelem);
1725 if (!elem)
1726 return NULL;
1727
1728 out:
1729 wqp = (struct wq_prepost *)elem;
1730 wqp_do_alloc_stats(elem);
1731 return wqp;
1732 }
1733
1734 /*
1735 static void wq_prepost_realloc(struct wq_prepost *wqp, int type)
1736 {
1737 wq_table_realloc_elem(&g_prepost_table, &wqp->wqte, type);
1738 }
1739 */
1740
1741 static void wq_prepost_invalidate(struct wq_prepost *wqp)
1742 {
1743 wqt_elem_invalidate(&wqp->wqte);
1744 }
1745
1746 static struct wq_prepost *wq_prepost_get(uint64_t wqp_id)
1747 {
1748 struct wqt_elem *elem;
1749
1750 elem = wq_table_get_elem(&g_prepost_table, wqp_id);
1751 return (struct wq_prepost *)elem;
1752 }
1753
1754 static void wq_prepost_put(struct wq_prepost *wqp)
1755 {
1756 wq_table_put_elem(&g_prepost_table, (struct wqt_elem *)wqp);
1757 }
1758
1759 static int wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)
1760 {
1761 return wqt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);
1762 }
1763
1764 static struct wq_prepost *wq_prepost_get_rnext(struct wq_prepost *head)
1765 {
1766 struct wqt_elem *elem;
1767 struct wq_prepost *wqp;
1768 uint64_t id;
1769
1770 elem = wqt_elem_list_next(&g_prepost_table, &head->wqte);
1771 if (!elem)
1772 return NULL;
1773 id = elem->wqt_id.id;
1774 elem = wq_table_get_elem(&g_prepost_table, id);
1775
1776 if (!elem)
1777 return NULL;
1778 wqp = (struct wq_prepost *)elem;
1779 if (elem->wqt_id.id != id ||
1780 wqp_type(wqp) != WQP_POST ||
1781 wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {
1782 wq_table_put_elem(&g_prepost_table, elem);
1783 return NULL;
1784 }
1785
1786 return wqp;
1787 }
1788
1789 static void wq_prepost_reset_rnext(struct wq_prepost *wqp)
1790 {
1791 wqt_elem_reset_next(&g_prepost_table, &wqp->wqte);
1792 }
1793
1794
1795 /**
1796 * remove 'wqp' from the prepost list on 'wqset'
1797 *
1798 * Conditions:
1799 * wqset is locked
1800 * caller holds a reference on wqp (and is responsible to release it)
1801 *
1802 * Result:
1803 * wqp is invalidated, wqset is potentially updated with a new
1804 * prepost ID, and the next element of the prepost list may be
1805 * consumed as well (if the list contained only 2 objects)
1806 */
1807 static int wq_prepost_remove(struct waitq_set *wqset,
1808 struct wq_prepost *wqp)
1809 {
1810 int more_posts = 1;
1811 uint64_t next_id = wqp->wqp_post.wqp_next_id;
1812 uint64_t wqp_id = wqp->wqp_prepostid.id;
1813 struct wq_prepost *prev_wqp, *next_wqp;
1814
1815 assert(wqp_type(wqp) == WQP_POST);
1816
1817 if (next_id == wqp_id) {
1818 /* the list is singular and becoming empty */
1819 wqset->wqset_prepost_id = 0;
1820 more_posts = 0;
1821 goto out;
1822 }
1823
1824 prev_wqp = wq_prepost_get_rnext(wqp);
1825 assert(prev_wqp != NULL);
1826 assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);
1827 assert(prev_wqp->wqp_prepostid.id != wqp_id);
1828 assert(wqp_type(prev_wqp) == WQP_POST);
1829
1830 if (prev_wqp->wqp_prepostid.id == next_id) {
1831 /*
1832 * There are two items in the list, and we're removing one. We
1833 * only need to keep the WQP_WQ pointer from 'prev_wqp'
1834 */
1835 wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;
1836 wq_prepost_invalidate(prev_wqp);
1837 wq_prepost_put(prev_wqp);
1838 more_posts = 0;
1839 goto out;
1840 }
1841
1842 /* prev->next = next */
1843 prev_wqp->wqp_post.wqp_next_id = next_id;
1844
1845 /* next->prev = prev */
1846 next_wqp = wq_prepost_get(next_id);
1847 assert(next_wqp != NULL);
1848 assert(next_wqp != wqp);
1849 assert(next_wqp != prev_wqp);
1850 assert(wqp_type(next_wqp) == WQP_POST);
1851
1852 wq_prepost_reset_rnext(next_wqp);
1853 wq_prepost_rlink(next_wqp, prev_wqp);
1854
1855 /* If we remove the head of the list, update the wqset */
1856 if (wqp_id == wqset->wqset_prepost_id)
1857 wqset->wqset_prepost_id = next_id;
1858
1859 wq_prepost_put(prev_wqp);
1860 wq_prepost_put(next_wqp);
1861
1862 out:
1863 wq_prepost_reset_rnext(wqp);
1864 wq_prepost_invalidate(wqp);
1865 return more_posts;
1866 }
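/*
 * Illustrative note (not part of the original source): the prepost list
 * hanging off a waitq set is circular in the forward direction
 * (wqp_next_id), with reverse links kept in the reused wqt_next_idx field.
 * Removing B from a three-element list conceptually goes from
 *
 *   wqset_prepost_id -> A -> B -> C -> A -> ...
 * to
 *   wqset_prepost_id -> A -> C -> A -> ...
 *
 * with A's forward pointer and C's reverse link rewritten and B
 * invalidated. With only two elements, both WQP_POST wrappers are retired
 * and the set keeps just the WQP_WQ id of the remaining prepost; with one
 * element the list becomes empty (wqset_prepost_id = 0).
 */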
1867
1868 static struct wq_prepost *wq_prepost_rfirst(uint64_t id)
1869 {
1870 struct wqt_elem *elem;
1871 elem = wqt_elem_list_first(&g_prepost_table, id);
1872 wqp_do_alloc_stats(elem);
1873 return (struct wq_prepost *)(void *)elem;
1874 }
1875
1876 static struct wq_prepost *wq_prepost_rpop(uint64_t *id, int type)
1877 {
1878 struct wqt_elem *elem;
1879 elem = wqt_elem_list_pop(&g_prepost_table, id, type);
1880 wqp_do_alloc_stats(elem);
1881 return (struct wq_prepost *)(void *)elem;
1882 }
1883
1884 static void wq_prepost_release_rlist(struct wq_prepost *wqp)
1885 {
1886 int nelem = 0;
1887 struct wqp_cache *cache;
1888 struct wqt_elem *elem;
1889
1890 if (!wqp)
1891 return;
1892
1893 elem = &wqp->wqte;
1894
1895 /*
1896 * These are reserved elements: release them back to the per-cpu pool
1897 * if our cache is running low.
1898 */
1899 disable_preemption();
1900 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
1901 if (cache->avail < WQP_CACHE_MAX) {
1902 struct wqt_elem *tmp = NULL;
1903 if (cache->head != WQT_IDX_MAX)
1904 tmp = wqt_elem_list_first(&g_prepost_table, cache->head);
1905 nelem = wqt_elem_list_link(&g_prepost_table, elem, tmp);
1906 cache->head = elem->wqt_id.id;
1907 cache->avail += nelem;
1908 enable_preemption();
1909 return;
1910 }
1911 enable_preemption();
1912
1913 /* release these elements back to the main table */
1914 nelem = wqt_elem_list_release(&g_prepost_table, elem, WQT_RESERVED);
1915
1916 #if CONFIG_WAITQ_STATS
1917 g_prepost_table.nreserved_releases += 1;
1918 OSDecrementAtomic64(&g_prepost_table.nreservations);
1919 #endif
1920 }
1921
1922 typedef int (*wqp_callback_func)(struct waitq_set *wqset,
1923 void *ctx,
1924 struct wq_prepost *wqp,
1925 struct waitq *waitq);
1926
1927 /**
1928 * iterate over a chain of preposts associated with a waitq set.
1929 *
1930 * Conditions:
1931 * wqset is locked
1932 *
1933 * Notes:
1934 * This loop performs automatic prepost chain management / culling, and
1935 * may reset or adjust the waitq set's prepost ID pointer. If you don't
1936 * want this extra processing, you can use wq_prepost_iterate().
1937 */
1938 static int wq_prepost_foreach_locked(struct waitq_set *wqset,
1939 void *ctx, wqp_callback_func cb)
1940 {
1941 int ret = WQ_ITERATE_CONTINUE; /* keep 'ret' well-defined if 'cb' is NULL */
1942 struct wq_prepost *wqp, *tmp_wqp;
1943
1944 if (!wqset || !wqset->wqset_prepost_id)
1945 return WQ_ITERATE_SUCCESS;
1946
1947 restart:
1948 wqp = wq_prepost_get(wqset->wqset_prepost_id);
1949 if (!wqp) {
1950 /*
1951 * The prepost object is no longer valid, reset the waitq
1952 * set's prepost id.
1953 */
1954 wqset->wqset_prepost_id = 0;
1955 return WQ_ITERATE_SUCCESS;
1956 }
1957
1958 if (wqp_type(wqp) == WQP_WQ) {
1959 uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;
1960 if (cb)
1961 ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1962
1963 switch (ret) {
1964 case WQ_ITERATE_INVALIDATE_CONTINUE:
1965 /* the caller wants to remove the only prepost here */
1966 assert(wqp_id == wqset->wqset_prepost_id);
1967 wqset->wqset_prepost_id = 0;
1968 /* fall through */
1969 case WQ_ITERATE_CONTINUE:
1970 wq_prepost_put(wqp);
1971 ret = WQ_ITERATE_SUCCESS;
1972 break;
1973 case WQ_ITERATE_RESTART:
1974 wq_prepost_put(wqp);
1975 /* fall through */
1976 case WQ_ITERATE_DROPPED:
1977 goto restart;
1978 default:
1979 wq_prepost_put(wqp);
1980 break;
1981 }
1982 return ret;
1983 }
1984
1985 assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);
1986 assert(wqp_type(wqp) == WQP_POST);
1987
1988 /*
1989 * At this point we know we have a list of POST objects.
1990 * Grab a handle to the last element in the list and start
1991 * the iteration.
1992 */
1993 tmp_wqp = wq_prepost_get_rnext(wqp);
1994 assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);
1995
1996 uint64_t last_id = tmp_wqp->wqp_prepostid.id;
1997 wq_prepost_put(tmp_wqp);
1998
1999 ret = WQ_ITERATE_SUCCESS;
2000 for (;;) {
2001 uint64_t wqp_id, first_id, next_id;
2002
2003 wqp_id = wqp->wqp_prepostid.id;
2004 first_id = wqset->wqset_prepost_id;
2005 next_id = wqp->wqp_post.wqp_next_id;
2006
2007 /* grab the WQP_WQ object this _POST points to */
2008 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
2009 if (!tmp_wqp) {
2010 /*
2011 * This WQP_POST object points to an invalid
2012 * WQP_WQ object - remove the POST object from
2013 * the list.
2014 */
2015 if (wq_prepost_remove(wqset, wqp) == 0) {
2016 wq_prepost_put(wqp);
2017 goto restart;
2018 }
2019 goto next_prepost;
2020 }
2021 assert(wqp_type(tmp_wqp) == WQP_WQ);
2022 /*
2023 * make the callback: note that this could remove 'wqp' or
2024 * drop the lock on our waitq set. We need to re-validate
2025 * our state when this function returns.
2026 */
2027 if (cb)
2028 ret = cb(wqset, ctx, wqp,
2029 tmp_wqp->wqp_wq.wqp_wq_ptr);
2030 wq_prepost_put(tmp_wqp);
2031
2032 switch (ret) {
2033 case WQ_ITERATE_CONTINUE:
2034 /* continue iteration */
2035 break;
2036 case WQ_ITERATE_INVALIDATE_CONTINUE:
2037 assert(next_id == wqp->wqp_post.wqp_next_id);
2038 if (wq_prepost_remove(wqset, wqp) == 0) {
2039 wq_prepost_put(wqp);
2040 goto restart;
2041 }
2042 goto next_prepost;
2043 case WQ_ITERATE_RESTART:
2044 wq_prepost_put(wqp);
2045 /* fall-through */
2046 case WQ_ITERATE_DROPPED:
2047 /* the callback dropped the ref to wqp: just restart */
2048 goto restart;
2049 default:
2050 /* break out of the iteration for some other reason */
2051 goto finish_prepost_foreach;
2052 }
2053
2054 /*
2055 * the set lock may have been dropped during callback,
2056 * if something looks different, restart the prepost iteration
2057 */
2058 if (!wqp_is_valid(wqp) ||
2059 (wqp->wqp_post.wqp_next_id != next_id) ||
2060 wqset->wqset_prepost_id != first_id) {
2061 wq_prepost_put(wqp);
2062 goto restart;
2063 }
2064
2065 next_prepost:
2066 /* this was the last object in the list */
2067 if (wqp_id == last_id)
2068 break;
2069
2070 /* get the next object */
2071 tmp_wqp = wq_prepost_get(next_id);
2072 if (!tmp_wqp) {
2073 /*
2074 * At this point we've already checked our state
2075 * after the callback (which may have dropped the set
2076 * lock). If we find an invalid member of the list
2077 * then something is wrong.
2078 */
2079 panic("Invalid WQP_POST member 0x%llx in waitq set "
2080 "0x%llx prepost list (first:%llx, "
2081 "wqp:%p)",
2082 next_id, wqset->wqset_id, first_id, wqp);
2083 }
2084 wq_prepost_put(wqp);
2085 wqp = tmp_wqp;
2086
2087 assert(wqp_type(wqp) == WQP_POST);
2088 }
2089
2090 finish_prepost_foreach:
2091 wq_prepost_put(wqp);
2092 if (ret == WQ_ITERATE_CONTINUE)
2093 ret = WQ_ITERATE_SUCCESS;
2094
2095 return ret;
2096 }
2097
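/*
 * Example (illustrative sketch only, not referenced anywhere in this file):
 * a minimal wqp_callback_func suitable for wq_prepost_foreach_locked().
 * It counts the waitqs preposted to 'wqset'; the int counter reached via
 * 'ctx' is a hypothetical object owned by the caller.
 *
 *	static int wqp_count_cb(struct waitq_set *wqset, void *ctx,
 *	                        struct wq_prepost *wqp, struct waitq *waitq)
 *	{
 *		int *count = (int *)ctx;
 *		(void)wqset;
 *		(void)wqp;
 *		(void)waitq;
 *		*count += 1;
 *		return WQ_ITERATE_CONTINUE;
 *	}
 *
 * Returning WQ_ITERATE_INVALIDATE_CONTINUE instead asks the loop above to
 * remove the current prepost and keep going, WQ_ITERATE_RESTART and
 * WQ_ITERATE_DROPPED restart the walk from the head, and any other value
 * breaks out of the iteration (see the switch statements in the function
 * above).
 */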
2098 /**
2099 * Perform a simple loop over a chain of prepost objects
2100 *
2101 * Conditions:
2102 * If 'prepost_id' is associated with a waitq (set) then that object must
2103 * be locked before calling this function.
2104 * Callback function, 'cb', must be able to handle a NULL wqset pointer
2105 * and a NULL waitq pointer!
2106 *
2107 * Notes:
2108 * This prepost chain iteration will _not_ automatically adjust any chain
2109 * element or linkage. This is the responsibility of the caller! If you
2110 * want automatic prepost chain management (at a cost of extra CPU time),
2111 * you can use: wq_prepost_foreach_locked().
2112 */
2113 static int wq_prepost_iterate(uint64_t prepost_id,
2114 void *ctx, wqp_callback_func cb)
2115 {
2116 int ret;
2117 struct wq_prepost *wqp;
2118
2119 if (!prepost_id)
2120 return WQ_ITERATE_SUCCESS;
2121
2122 wqp = wq_prepost_get(prepost_id);
2123 if (!wqp)
2124 return WQ_ITERATE_SUCCESS;
2125
2126 if (wqp_type(wqp) == WQP_WQ) {
2127 ret = WQ_ITERATE_SUCCESS;
2128 if (cb)
2129 ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
2130
2131 if (ret != WQ_ITERATE_DROPPED)
2132 wq_prepost_put(wqp);
2133 return ret;
2134 }
2135
2136 assert(wqp->wqp_prepostid.id == prepost_id);
2137 assert(wqp_type(wqp) == WQP_POST);
2138
2139 /* at this point we know we have a list of POST objects */
2140 uint64_t next_id;
2141
2142 ret = WQ_ITERATE_CONTINUE;
2143 do {
2144 struct wq_prepost *tmp_wqp;
2145 struct waitq *wq = NULL;
2146
2147 next_id = wqp->wqp_post.wqp_next_id;
2148
2149 /* grab the WQP_WQ object this _POST points to */
2150 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
2151 if (tmp_wqp) {
2152 assert(wqp_type(tmp_wqp) == WQP_WQ);
2153 wq = tmp_wqp->wqp_wq.wqp_wq_ptr;
2154 }
2155
2156 if (cb)
2157 ret = cb(NULL, ctx, wqp, wq);
2158 if (tmp_wqp)
2159 wq_prepost_put(tmp_wqp);
2160
2161 if (ret != WQ_ITERATE_CONTINUE)
2162 break;
2163
2164 tmp_wqp = wq_prepost_get(next_id);
2165 if (!tmp_wqp) {
2166 /*
2167 * the chain is broken: nothing we can do here besides
2168 * bail from the iteration.
2169 */
2170 ret = WQ_ITERATE_ABORTED;
2171 break;
2172 }
2173
2174 wq_prepost_put(wqp);
2175 wqp = tmp_wqp;
2176
2177 assert(wqp_type(wqp) == WQP_POST);
2178 } while (next_id != prepost_id);
2179
2180 if (ret != WQ_ITERATE_DROPPED)
2181 wq_prepost_put(wqp);
2182
2183 if (ret == WQ_ITERATE_CONTINUE)
2184 ret = WQ_ITERATE_SUCCESS;
2185 return ret;
2186 }
2187
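/*
 * Example (illustrative sketch, hypothetical name): a callback intended for
 * wq_prepost_iterate() must tolerate a NULL wqset (it is always NULL here)
 * and a NULL waitq (when the WQP_WQ object could not be resolved):
 *
 *	static int wqp_dump_cb(struct waitq_set *wqset, void *ctx,
 *	                       struct wq_prepost *wqp, struct waitq *waitq)
 *	{
 *		(void)wqset;
 *		(void)ctx;
 *		if (waitq)
 *			wqdbg_v("prepost 0x%llx -> waitq %p",
 *			        wqp->wqp_prepostid.id, waitq);
 *		return WQ_ITERATE_CONTINUE;
 *	}
 */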
2188
2189 struct _is_posted_ctx {
2190 struct waitq *posting_wq;
2191 int did_prepost;
2192 };
2193
2194 static int wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,
2195 struct wq_prepost *wqp, struct waitq *waitq)
2196 {
2197 struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;
2198
2199 (void)wqset;
2200 (void)wqp;
2201
2202 /*
2203 * Don't early-out; run through the _entire_ list so the iteration
2204 * culls stale entries and keeps the number of invalid elements to a minimum.
2205 */
2206 if (pctx->posting_wq == waitq)
2207 pctx->did_prepost = 1;
2208
2209 return WQ_ITERATE_CONTINUE;
2210 }
2211
2212
2213 /**
2214 * checks if 'waitq' has already preposted on 'wqset'
2215 *
2216 * Parameters:
2217 * waitq The waitq that's preposting
2218 * wqset The set onto which waitq may be preposted
2219 *
2220 * Conditions:
2221 * both waitq and wqset are locked
2222 *
2223 * Returns non-zero if 'waitq' has already preposted to 'wqset'
2224 */
2225 static int wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)
2226 {
2227 int ret;
2228 struct _is_posted_ctx pctx;
2229
2230 /*
2231 * If the set's only prepost matches the waitq's prepost ID,
2232 * then the waitq has obviously already preposted to the set.
2233 */
2234 if (waitq->waitq_prepost_id != 0 &&
2235 wqset->wqset_prepost_id == waitq->waitq_prepost_id)
2236 return 1;
2237
2238 /* use full prepost iteration: always trim the list */
2239 pctx.posting_wq = waitq;
2240 pctx.did_prepost = 0;
2241 ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,
2242 wq_is_preposted_on_set_cb);
2243 return pctx.did_prepost;
2244 }
2245
2246 static struct wq_prepost *wq_get_prepost_obj(uint64_t *reserved, int type)
2247 {
2248 struct wq_prepost *wqp = NULL;
2249 /*
2250 * Don't fail just because the caller doesn't have enough
2251 * reservations; we've kept a low-water mark on the prepost table,
2252 * so there should be some elements available for us.
2253 */
2254 if (reserved && *reserved) {
2255 wqp = wq_prepost_rpop(reserved, type);
2256 } else {
2257 /*
2258 * TODO: if in interrupt context, grab from a special
2259 * region / reserved list!
2260 */
2261 wqp = wq_prepost_alloc(type, 1);
2262 }
2263
2264 if (wqp == NULL)
2265 panic("Couldn't allocate prepost object!");
2266 return wqp;
2267 }
2268
2269
2270 /**
2271 * prepost a waitq onto a waitq set
2272 *
2273 * Parameters:
2274 * wqset The set onto which waitq will be preposted
2275 * waitq The waitq that's preposting
2276 * reserved List (wqt_elem_list_ style) of pre-allocated prepost elements
2277 * Could be NULL
2278 *
2279 * Conditions:
2280 * both wqset and waitq are locked
2281 *
2282 * Notes:
2283 * If reserved is NULL, this may block on prepost table growth.
2284 */
2285 static void wq_prepost_do_post_locked(struct waitq_set *wqset,
2286 struct waitq *waitq,
2287 uint64_t *reserved)
2288 {
2289 struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;
2290
2291 assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));
2292
2293 /*
2294 * nothing to do if it's already preposted:
2295 * note that this also culls any invalid prepost objects
2296 */
2297 if (wq_is_preposted_on_set(waitq, wqset))
2298 return;
2299
2300 /*
2301 * This function is called because an event is being posted to 'waitq'.
2302 * We need a prepost object associated with this queue. Allocate one
2303 * now if the waitq isn't already associated with one.
2304 */
2305 if (waitq->waitq_prepost_id == 0) {
2306 struct wq_prepost *wqp;
2307 wqp = wq_get_prepost_obj(reserved, WQP_WQ);
2308 wqp->wqp_wq.wqp_wq_ptr = waitq;
2309 wqp_set_valid(wqp);
2310 waitq->waitq_prepost_id = wqp->wqp_prepostid.id;
2311 wq_prepost_put(wqp);
2312 }
2313
2314 #if CONFIG_WAITQ_STATS
2315 g_prepost_table.npreposts += 1;
2316 #endif
2317
2318 wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",
2319 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),
2320 waitq->waitq_prepost_id, wqset->wqset_id);
2321
2322 if (wqset->wqset_prepost_id == 0) {
2323 /* the set has no previous preposts */
2324 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
2325 return;
2326 }
2327
2328 wqp_head = wq_prepost_get(wqset->wqset_prepost_id);
2329 if (!wqp_head) {
2330 /* the previous prepost has become invalid */
2331 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
2332 return;
2333 }
2334
2335 assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);
2336
2337 /*
2338 * If we get here, we're going to need at least one new wq_prepost
2339 * object. If the previous wqset_prepost_id points to a WQP_WQ, we
2340 * actually need to allocate 2 wq_prepost objects because the WQP_WQ
2341 * is tied to the waitq and shared across all sets.
2342 */
2343 wqp_post = wq_get_prepost_obj(reserved, WQP_POST);
2344
2345 wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;
2346 wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,
2347 waitq->waitq_prepost_id);
2348
2349 if (wqp_type(wqp_head) == WQP_WQ) {
2350 /*
2351 * We must replace the wqset_prepost_id with a pointer
2352 * to two new WQP_POST objects
2353 */
2354 uint64_t wqp_id = wqp_head->wqp_prepostid.id;
2355 wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "
2356 "replacing with two POST preposts",
2357 wqset->wqset_id, wqp_id);
2358
2359 /* drop the old reference */
2360 wq_prepost_put(wqp_head);
2361
2362 /* grab another new object (the 2nd of two) */
2363 wqp_head = wq_get_prepost_obj(reserved, WQP_POST);
2364
2365 /* point this one to the original WQP_WQ object */
2366 wqp_head->wqp_post.wqp_wq_id = wqp_id;
2367 wqdbg_v("POST 0x%llx :: WQ 0x%llx",
2368 wqp_head->wqp_prepostid.id, wqp_id);
2369
2370 /* link it to the new wqp_post object allocated earlier */
2371 wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
2372 /* make the list doubly-linked and circular */
2373 wq_prepost_rlink(wqp_head, wqp_post);
2374
2375 /*
2376 * Finish setting up the new prepost: point it back to the
2377 * POST object we allocated to replace the original wqset
2378 * WQ prepost object
2379 */
2380 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
2381 wq_prepost_rlink(wqp_post, wqp_head);
2382
2383 /* mark objects valid, and reset the wqset prepost list head */
2384 wqp_set_valid(wqp_head);
2385 wqp_set_valid(wqp_post);
2386 wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;
2387
2388 /* release both references */
2389 wq_prepost_put(wqp_head);
2390 wq_prepost_put(wqp_post);
2391
2392 wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",
2393 wqset->wqset_id, wqset->wqset_prepost_id,
2394 wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,
2395 wqp_post->wqp_prepostid.id,
2396 wqp_post->wqp_post.wqp_next_id);
2397 return;
2398 }
2399
2400 assert(wqp_type(wqp_head) == WQP_POST);
2401
2402 /*
2403 * Add the new prepost to the end of the prepost list
2404 */
2405 wqp_tail = wq_prepost_get_rnext(wqp_head);
2406 assert(wqp_tail != NULL);
2407 assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);
2408
2409 /*
2410 * link the head to the new tail
2411 * NOTE: this needs to happen first in case wqp_tail == wqp_head
2412 */
2413 wq_prepost_reset_rnext(wqp_head);
2414 wq_prepost_rlink(wqp_head, wqp_post);
2415
2416 /* point the new object to the list head, and list tail */
2417 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
2418 wq_prepost_rlink(wqp_post, wqp_tail);
2419
2420 /* point the last item in the waitq set's list to the new object */
2421 wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
2422
2423 wqp_set_valid(wqp_post);
2424
2425 wq_prepost_put(wqp_head);
2426 wq_prepost_put(wqp_tail);
2427 wq_prepost_put(wqp_post);
2428
2429 wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "
2430 "new_prepost:0x%llx->0x%llx", wqset->wqset_id,
2431 wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,
2432 wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);
2433
2434 return;
2435 }
2436
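/*
 * Illustrative sketch of the structure built above once two waitqs, A and
 * B (hypothetical), have preposted to a set:
 *
 *	wqset_prepost_id -> POST_a -> POST_b -> POST_a -> ...  (wqp_next_id,
 *	                                                         circular)
 *	POST_a->wqp_post.wqp_wq_id -> WQP_WQ object for A
 *	POST_b->wqp_post.wqp_wq_id -> WQP_WQ object for B
 *
 * Each WQP_WQ object is tied to its waitq and shared across every set that
 * waitq preposts to. The reverse links installed by wq_prepost_rlink()
 * make the POST chain doubly-linked, which is what lets
 * wq_prepost_get_rnext(head) find the list tail directly.
 */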
2437
2438 /* ----------------------------------------------------------------------
2439 *
2440 * Stats collection / reporting
2441 *
2442 * ---------------------------------------------------------------------- */
2443 #if CONFIG_WAITQ_STATS
2444 static void wq_table_stats(struct wq_table *table, struct wq_table_stats *stats)
2445 {
2446 stats->version = WAITQ_STATS_VERSION;
2447 stats->table_elements = table->nelem;
2448 stats->table_used_elems = table->used_elem;
2449 stats->table_elem_sz = table->elem_sz;
2450 stats->table_slabs = table->nslabs;
2451 stats->table_slab_sz = table->slab_sz;
2452
2453 stats->table_num_allocs = table->nallocs;
2454 stats->table_num_preposts = table->npreposts;
2455 stats->table_num_reservations = table->nreservations;
2456
2457 stats->table_max_used = table->max_used;
2458 stats->table_avg_used = table->avg_used;
2459 stats->table_max_reservations = table->max_reservations;
2460 stats->table_avg_reservations = table->avg_reservations;
2461 }
2462
2463 void waitq_link_stats(struct wq_table_stats *stats)
2464 {
2465 if (!stats)
2466 return;
2467 wq_table_stats(&g_linktable, stats);
2468 }
2469
2470 void waitq_prepost_stats(struct wq_table_stats *stats)
2471 {
2472 wq_table_stats(&g_prepost_table, stats);
2473 }
2474 #endif
2475
2476
2477 /* ----------------------------------------------------------------------
2478 *
2479 * Global Wait Queues
2480 *
2481 * ---------------------------------------------------------------------- */
2482
2483 static struct waitq g_boot_waitq;
2484 static struct waitq *global_waitqs = &g_boot_waitq;
2485 static uint32_t g_num_waitqs = 1;
2486
2487 /*
2488 * Zero out the unused MSBs of the event (keep only the low _EVENT_MASK_BITS bits).
2489 */
2490 #define _CAST_TO_EVENT_MASK(event) ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
2491
2492 /*
2493 * The Jenkins "one at a time" hash.
2494 * TBD: There may be some value to unrolling here,
2495 * depending on the architecture.
2496 */
2497 static __inline__ uint32_t waitq_hash(char *key, size_t length)
2498 {
2499 uint32_t hash = 0;
2500 size_t i;
2501
2502 for (i = 0; i < length; i++) {
2503 hash += key[i];
2504 hash += (hash << 10);
2505 hash ^= (hash >> 6);
2506 }
2507
2508 hash += (hash << 3);
2509 hash ^= (hash >> 11);
2510 hash += (hash << 15);
2511
2512 hash &= (g_num_waitqs - 1);
2513 return hash;
2514 }
2515
2516 /* return a global waitq pointer corresponding to the given event */
2517 struct waitq *_global_eventq(char *event, size_t event_length)
2518 {
2519 return &global_waitqs[waitq_hash(event, event_length)];
2520 }
2521
2522 /* return an indexed global waitq pointer */
2523 struct waitq *global_waitq(int index)
2524 {
2525 return &global_waitqs[index % g_num_waitqs];
2526 }
2527
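/*
 * Example (illustrative): mapping an event to its global waitq. The event
 * bytes are hashed and masked into [0, g_num_waitqs), which is why
 * waitq_bootstrap() forces g_num_waitqs to be a power of 2.
 *
 *	event64_t ev = (event64_t)(uintptr_t)&some_object;   (hypothetical)
 *	struct waitq *wq = _global_eventq((char *)&ev, sizeof(ev));
 */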
2528
2529 #if CONFIG_WAITQ_STATS
2530 /* this global is for lldb */
2531 const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
2532 struct wq_stats g_boot_stats;
2533 struct wq_stats *g_waitq_stats = &g_boot_stats;
2534
2535 static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
2536 {
2537 if (skip < 0)
2538 skip = 0;
2539 uintptr_t buf[NWAITQ_BTFRAMES + skip];
2540 memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
2541 fastbacktrace(buf, g_nwaitq_btframes + skip);
2542 memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
2543 }
2544
2545 static __inline__ struct wq_stats *waitq_global_stats(struct waitq *waitq) {
2546 struct wq_stats *wqs;
2547 uint32_t idx;
2548
2549 if (!waitq_is_global(waitq))
2550 return NULL;
2551
2552 idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));
2553 assert(idx < g_num_waitqs);
2554 wqs = &g_waitq_stats[idx];
2555 return wqs;
2556 }
2557
2558 static __inline__ void waitq_stats_count_wait(struct waitq *waitq)
2559 {
2560 struct wq_stats *wqs = waitq_global_stats(waitq);
2561 if (wqs != NULL) {
2562 wqs->waits++;
2563 waitq_grab_backtrace(wqs->last_wait, 2);
2564 }
2565 }
2566
2567 static __inline__ void waitq_stats_count_wakeup(struct waitq *waitq)
2568 {
2569 struct wq_stats *wqs = waitq_global_stats(waitq);
2570 if (wqs != NULL) {
2571 wqs->wakeups++;
2572 waitq_grab_backtrace(wqs->last_wakeup, 2);
2573 }
2574 }
2575
2576 static __inline__ void waitq_stats_count_clear_wakeup(struct waitq *waitq)
2577 {
2578 struct wq_stats *wqs = waitq_global_stats(waitq);
2579 if (wqs != NULL) {
2580 wqs->wakeups++;
2581 wqs->clears++;
2582 waitq_grab_backtrace(wqs->last_wakeup, 2);
2583 }
2584 }
2585
2586 static __inline__ void waitq_stats_count_fail(struct waitq *waitq)
2587 {
2588 struct wq_stats *wqs = waitq_global_stats(waitq);
2589 if (wqs != NULL) {
2590 wqs->failed_wakeups++;
2591 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
2592 }
2593 }
2594 #else
2595 #define waitq_stats_count_wait(q) do { } while (0)
2596 #define waitq_stats_count_wakeup(q) do { } while (0)
2597 #define waitq_stats_count_clear_wakeup(q) do { } while (0)
2598 #define waitq_stats_count_fail(q) do { } while (0)
2599 #endif
2600
2601 int waitq_is_valid(struct waitq *waitq)
2602 {
2603 return (waitq != NULL) && ((waitq->waitq_type & ~1) == WQT_QUEUE);
2604 }
2605
2606 int waitq_set_is_valid(struct waitq_set *wqset)
2607 {
2608 return (wqset != NULL) && waitqs_is_set(wqset);
2609 }
2610
2611 int waitq_is_global(struct waitq *waitq)
2612 {
2613 if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs)
2614 return 1;
2615 return 0;
2616 }
2617
2618 int waitq_irq_safe(struct waitq *waitq)
2619 {
2620 /* global wait queues have this bit set on initialization */
2621 return waitq->waitq_irq;
2622 }
2623
2624 static uint32_t waitq_hash_size(void)
2625 {
2626 uint32_t hsize, queues;
2627
2628 if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize)))
2629 return (hsize);
2630
2631 queues = thread_max / 11;
2632 hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);
2633
2634 return hsize;
2635 }
2636
2637 void waitq_bootstrap(void)
2638 {
2639 kern_return_t kret;
2640 uint32_t whsize, qsz;
2641
2642 wq_table_bootstrap();
2643 lt_init();
2644 wqp_init();
2645
2646 /*
2647 * Determine the amount of memory we're willing to reserve for
2648 * the waitqueue hash table
2649 */
2650 whsize = waitq_hash_size();
2651
2652 /* Determine the number of waitqueues we can fit. */
2653 qsz = sizeof(struct waitq);
2654 whsize = ROUNDDOWN(whsize, qsz);
2655 g_num_waitqs = whsize / qsz;
2656
2657 /*
2658 * The hash algorithm requires that this be a power of 2, so we
2659 * just mask off all the low-order bits.
2660 */
2661 for (uint32_t i = 0; i < 31; i++) {
2662 uint32_t bit = (1 << i);
2663 if ((g_num_waitqs & bit) == g_num_waitqs)
2664 break;
2665 g_num_waitqs &= ~bit;
2666 }
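/*
 * Worked example: if the division above yielded g_num_waitqs == 1000
 * (0b1111101000), the loop clears bits 3, 5, 6, 7 and 8 in turn and stops
 * once only bit 9 remains, leaving g_num_waitqs == 512, the largest
 * power of 2 not exceeding the original value.
 */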
2667 assert(g_num_waitqs > 0);
2668
2669 /* Now determine how much memory we really need. */
2670 whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);
2671
2672 wqdbg("allocating %d global queues (%d bytes)", g_num_waitqs, whsize);
2673 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
2674 whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
2675 if (kret != KERN_SUCCESS || global_waitqs == NULL)
2676 panic("kernel_memory_allocate() failed to alloc global_waitqs"
2677 ", error: %d, whsize: 0x%x", kret, whsize);
2678
2679 #if CONFIG_WAITQ_STATS
2680 whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);
2681 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
2682 whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
2683 if (kret != KERN_SUCCESS || g_waitq_stats == NULL)
2684 panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
2685 ", error: %d, whsize: 0x%x", kret, whsize);
2686 memset(g_waitq_stats, 0, whsize);
2687 #endif
2688
2689 for (uint32_t i = 0; i < g_num_waitqs; i++) {
2690 waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ);
2691 }
2692
2693
2694 waitq_set_zone = zinit(sizeof(struct waitq_set),
2695 WAITQ_SET_MAX * sizeof(struct waitq_set),
2696 sizeof(struct waitq_set),
2697 "waitq sets");
2698 zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE);
2699 }
2700
2701
2702 /* ----------------------------------------------------------------------
2703 *
2704 * Wait Queue Implementation
2705 *
2706 * ---------------------------------------------------------------------- */
2707
2708 /*
2709 * Double the standard lock timeout, because wait queues tend
2710 * to iterate over a number of threads - locking each. If there is
2711 * a problem with a thread lock, it normally times out at the wait
2712 * queue level first, hiding the real problem.
2713 */
2714 /* For x86, the hardware timeout is in TSC units. */
2715 #if defined(__i386__) || defined(__x86_64__)
2716 #define hwLockTimeOut LockTimeOutTSC
2717 #else
2718 #define hwLockTimeOut LockTimeOut
2719 #endif
2720
2721 void waitq_lock(struct waitq *wq)
2722 {
2723 if (__improbable(hw_lock_to(&(wq)->waitq_interlock,
2724 hwLockTimeOut * 2) == 0)) {
2725 boolean_t wql_acquired = FALSE;
2726
2727 while (machine_timeout_suspended()) {
2728 #if defined(__i386__) || defined(__x86_64__)
2729 /*
2730 * i386/x86_64 return with preemption disabled on a
2731 * timeout for diagnostic purposes.
2732 */
2733 mp_enable_preemption();
2734 #endif
2735 wql_acquired = hw_lock_to(&(wq)->waitq_interlock,
2736 hwLockTimeOut * 2);
2737 if (wql_acquired)
2738 break;
2739 }
2740 if (wql_acquired == FALSE)
2741 panic("waitq deadlock - waitq=%p, cpu=%d\n",
2742 wq, cpu_number());
2743 }
2744 assert(waitq_held(wq));
2745 }
2746
2747 void waitq_unlock(struct waitq *wq)
2748 {
2749 assert(waitq_held(wq));
2750 hw_lock_unlock(&(wq)->waitq_interlock);
2751 }
2752
2753
2754 /**
2755 * clear the thread-related waitq state
2756 *
2757 * Conditions:
2758 * 'thread' is locked
2759 */
2760 static inline void thread_clear_waitq_state(thread_t thread)
2761 {
2762 thread->waitq = NULL;
2763 thread->wait_event = NO_EVENT64;
2764 thread->at_safe_point = FALSE;
2765 }
2766
2767
2768 typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,
2769 int is_global, thread_t thread);
2770
2771 struct waitq_select_args {
2772 /* input parameters */
2773 struct waitq *posted_waitq;
2774 struct waitq *waitq;
2775 event64_t event;
2776 waitq_select_cb select_cb;
2777 void *select_ctx;
2778
2779 uint64_t *reserved_preposts;
2780
2781 /* output parameters */
2782 queue_t threadq;
2783 int max_threads;
2784 int *nthreads;
2785 spl_t *spl;
2786 };
2787
2788 static void do_waitq_select_n_locked(struct waitq_select_args *args);
2789
2790 /**
2791 * callback invoked once for every waitq set to which a waitq belongs
2792 *
2793 * Conditions:
2794 * ctx->posted_waitq is locked
2795 * 'link' points to a valid waitq set
2796 *
2797 * Notes:
2798 * Takes the waitq set lock on the set pointed to by 'link'
2799 * Calls do_waitq_select_n_locked() which could recurse back into
2800 * this function if the waitq set is a member of other sets.
2801 * If no threads were selected, it preposts the input waitq
2802 * onto the waitq set pointed to by 'link'.
2803 */
2804 static int waitq_select_walk_cb(struct waitq *waitq, void *ctx,
2805 struct setid_link *link)
2806 {
2807 int ret = WQ_ITERATE_CONTINUE;
2808 struct waitq_select_args args = *((struct waitq_select_args *)ctx);
2809 struct waitq_set *wqset;
2810 int get_spl = 0;
2811 spl_t set_spl;
2812
2813 (void)waitq;
2814 assert(sl_type(link) == SLT_WQS);
2815
2816 wqset = link->sl_wqs.sl_set;
2817 args.waitq = &wqset->wqset_q;
2818
2819 if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) {
2820 get_spl = 1;
2821 set_spl = splsched();
2822 }
2823 waitq_set_lock(wqset);
2824 /*
2825 * verify that the link wasn't invalidated just before
2826 * we were able to take the lock.
2827 */
2828 if (wqset->wqset_id != link->sl_set_id.id)
2829 goto out_unlock;
2830
2831 /*
2832 * Find any threads waiting on this wait queue set,
2833 * and recurse into any waitq set to which this set belongs.
2834 */
2835 do_waitq_select_n_locked(&args);
2836
2837 if (*(args.nthreads) > 0 ||
2838 (args.threadq && !queue_empty(args.threadq))) {
2839 /* at least 1 thread was selected and returned: don't prepost */
2840 if (args.max_threads > 0 &&
2841 *(args.nthreads) >= args.max_threads) {
2842 /* break out of the setid walk */
2843 ret = WQ_ITERATE_FOUND;
2844 }
2845 goto out_unlock;
2846 } else {
2847 /*
2848 * No thread selected: prepost 'waitq' to 'wqset'
2849 * if wqset can handle preposts and the event is set to 0.
2850 * We also make sure to not post waitq sets to other sets.
2851 *
2852 * In the future, we may consider an optimization to prepost
2853 * 'args.posted_waitq' directly to 'wqset' to avoid
2854 * unnecessary data structure manipulations in the kqueue path
2855 */
2856 if (args.event == NO_EVENT64 && waitq_set_can_prepost(wqset)) {
2857 wq_prepost_do_post_locked(wqset, waitq,
2858 args.reserved_preposts);
2859 }
2860 }
2861
2862 out_unlock:
2863 waitq_set_unlock(wqset);
2864 if (get_spl)
2865 splx(set_spl);
2866 return ret;
2867 }
2868
2869 /**
2870 * generic thread selection from a waitq (and sets to which the waitq belongs)
2871 *
2872 * Conditions:
2873 * args->waitq (and args->posted_waitq) is locked
2874 *
2875 * Notes:
2876 * Uses the optional select callback function to refine the selection
2877 * of one or more threads from a waitq and any set to which the waitq
2878 * belongs. The select callback is invoked once for every thread that
2879 * is found to be waiting on the input args->waitq.
2880 *
2881 * If one or more threads are selected, this may disable interrupts.
2882 * The previous interrupt state is returned in args->spl and should
2883 * be used in a call to splx() if threads are returned to the caller.
2884 */
2885 static void do_waitq_select_n_locked(struct waitq_select_args *args)
2886 {
2887 struct waitq *waitq = args->waitq;
2888 int max_threads = args->max_threads;
2889 thread_t thread = THREAD_NULL, first_thread = THREAD_NULL;
2890 int global_q = 0;
2891 unsigned long eventmask = 0;
2892 int *nthreads = args->nthreads;
2893
2894 assert(max_threads != 0);
2895
2896 global_q = waitq_is_global(waitq);
2897 if (global_q) {
2898 eventmask = _CAST_TO_EVENT_MASK(args->event);
2899 /* make sure this waitq accepts this event mask */
2900 if ((waitq->waitq_eventmask & eventmask) != eventmask)
2901 return;
2902 eventmask = 0;
2903 }
2904
2905 /* look through each thread waiting directly on the waitq */
2906 qe_foreach_element_safe(thread, &waitq->waitq_queue, links) {
2907 thread_t t = THREAD_NULL;
2908 assert(thread->waitq == waitq);
2909 if (thread->wait_event == args->event) {
2910 t = thread;
2911 if (first_thread == THREAD_NULL)
2912 first_thread = thread;
2913
2914 /* allow the caller to further refine the selection */
2915 if (args->select_cb)
2916 t = args->select_cb(args->select_ctx, waitq,
2917 global_q, thread);
2918 if (t != THREAD_NULL) {
2919 *nthreads += 1;
2920 if (args->threadq) {
2921 if (*nthreads == 1)
2922 *(args->spl) = splsched();
2923 thread_lock(t);
2924 thread_clear_waitq_state(t);
2925 /* put locked thread on output queue */
2926 re_queue_tail(args->threadq, &t->links);
2927 }
2928 /* only enqueue up to 'max' threads */
2929 if (*nthreads >= max_threads && max_threads > 0)
2930 break;
2931 }
2932 }
2933 /* thread wasn't selected, and the waitq is global */
2934 if (t == THREAD_NULL && global_q)
2935 eventmask |= _CAST_TO_EVENT_MASK(thread->wait_event);
2936 }
2937
2938 /*
2939 * Update the eventmask of global queues:
2940 * - If we selected all the threads in the queue, or we selected zero
2941 * threads on the queue, set the eventmask to the calculated value
2942 * (potentially 0 if we selected them all)
2943 * - If we just pulled out a subset of threads from the queue, then we
2944 * can't assume the calculated mask is complete (because we may not
2945 * have made it through all the threads in the queue), so we have to
2946 * leave it alone.
2947 */
2948 if (global_q && (queue_empty(&waitq->waitq_queue) || *nthreads == 0))
2949 waitq->waitq_eventmask = (typeof(waitq->waitq_eventmask))eventmask;
2950
2951 /*
2952 * Grab the first thread in the queue if no other thread was selected.
2953 * We can guarantee that no one has manipulated this thread because
2954 * it's waiting on the given waitq, and we have that waitq locked.
2955 */
2956 if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {
2957 /* we know this is the first (and only) thread */
2958 ++(*nthreads);
2959 *(args->spl) = splsched();
2960 thread_lock(first_thread);
2961 thread_clear_waitq_state(first_thread);
2962 re_queue_tail(args->threadq, &first_thread->links);
2963
2964 /* update the eventmask on global queues */
2965 if (global_q && queue_empty(&waitq->waitq_queue))
2966 waitq->waitq_eventmask = 0;
2967 }
2968
2969 if (max_threads > 0 && *nthreads >= max_threads)
2970 return;
2971
2972 /*
2973 * wait queues that are not in any sets
2974 * are the bottom of the recursion
2975 */
2976 if (!waitq->waitq_set_id)
2977 return;
2978
2979 /* check to see if the set ID for this wait queue is valid */
2980 struct setid_link *link = lt_get_link(waitq->waitq_set_id);
2981 if (!link) {
2982 /* the waitq set to which this waitq belonged has been invalidated */
2983 waitq->waitq_set_id = 0;
2984 return;
2985 }
2986
2987 lt_put_link(link);
2988
2989 /*
2990 * If this waitq is a member of any wait queue sets, we need to look
2991 * for waiting thread(s) in any of those sets, and prepost all sets that
2992 * don't have active waiters.
2993 *
2994 * Note that we do a local walk of this waitq's links - we manually
2995 * recurse down wait queue sets with non-zero wqset_q.waitq_set_id
2996 */
2997 (void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
2998 SLT_WQS, (void *)args, waitq_select_walk_cb);
2999 }
3000
3001 /**
3002 * main entry point for thread selection from a waitq
3003 *
3004 * Conditions:
3005 * waitq is locked
3006 *
3007 * Returns:
3008 * The number of threads waiting on 'waitq' for 'event' which have
3009 * been placed onto the input 'threadq'
3010 *
3011 * Notes:
3012 * The 'select_cb' function is invoked for every thread found waiting
3013 * on 'waitq' for 'event'. The thread is _not_ locked upon callback
3014 * invocation. This parameter may be NULL.
3015 *
3016 * If one or more threads are returned in 'threadq' then the caller is
3017 * responsible to call splx() using the returned 'spl' value. Each
3018 * returned thread is locked.
3019 */
3020 static __inline__ int waitq_select_n_locked(struct waitq *waitq,
3021 event64_t event,
3022 waitq_select_cb select_cb,
3023 void *select_ctx,
3024 uint64_t *reserved_preposts,
3025 queue_t threadq,
3026 int max_threads, spl_t *spl)
3027 {
3028 int nthreads = 0;
3029
3030 struct waitq_select_args args = {
3031 .posted_waitq = waitq,
3032 .waitq = waitq,
3033 .event = event,
3034 .select_cb = select_cb,
3035 .select_ctx = select_ctx,
3036 .reserved_preposts = reserved_preposts,
3037 .threadq = threadq,
3038 .max_threads = max_threads,
3039 .nthreads = &nthreads,
3040 .spl = spl,
3041 };
3042
3043 do_waitq_select_n_locked(&args);
3044 return nthreads;
3045 }
3046
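/*
 * Example (illustrative sketch of the calling convention; it mirrors what
 * waitq_wakeup64_all_locked() does further below, and 'my_event' is a
 * hypothetical event):
 *
 *	queue_head_t wakeq;
 *	spl_t spl;
 *	int nthreads;
 *
 *	queue_init(&wakeq);
 *	... lock 'waitq' ...
 *	nthreads = waitq_select_n_locked(waitq, my_event, NULL, NULL,
 *	                                 NULL, &wakeq, -1, &spl);
 *	... unlock 'waitq', then thread_go() + thread_unlock() each thread ...
 *	if (nthreads > 0)
 *		splx(spl);
 */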
3047
3048 /**
3049 * callback function that uses thread parameters to determine wakeup eligibility
3050 *
3051 * Conditions:
3052 * 'waitq' is locked
3053 * 'thread' is not locked
3054 */
3055 static thread_t waitq_select_one_cb(void *ctx, struct waitq *waitq,
3056 int is_global, thread_t thread)
3057 {
3058 int fifo_q, realtime;
3059 boolean_t thread_imp_donor = FALSE;
3060
3061 (void)ctx;
3062 (void)waitq;
3063 (void)is_global;
3064 realtime = 0;
3065
3066 fifo_q = 1; /* default to FIFO for all queues for now */
3067 #if IMPORTANCE_INHERITANCE
3068 if (is_global)
3069 fifo_q = 0; /* 'thread_imp_donor' takes the place of FIFO checking */
3070 #endif
3071
3072 if (thread->sched_pri >= BASEPRI_REALTIME)
3073 realtime = 1;
3074
3075 #if IMPORTANCE_INHERITANCE
3076 /*
3077 * Checking the imp donor bit does not require the thread
3078 * lock or the task lock, since we hold the wait queue lock
3079 * and the thread cannot be removed from it without acquiring
3080 * the wait queue lock. The imp donor bit may change after
3081 * we read its value, but it is okay to wake a thread
3082 * while someone drops an importance assertion
3083 * on that thread.
3084 */
3085 thread_imp_donor = task_is_importance_donor(thread->task);
3086 #endif /* IMPORTANCE_INHERITANCE */
3087
3088 if (fifo_q || thread_imp_donor == TRUE
3089 || realtime || (thread->options & TH_OPT_VMPRIV)) {
3090 /*
3091 * If this thread's task is an importance donor,
3092 * or it's a realtime thread, or it's a VM privileged
3093 * thread, OR the queue is marked as FIFO:
3094 * select the thread
3095 */
3096 return thread;
3097 }
3098
3099 /* by default, _don't_ select the thread */
3100 return THREAD_NULL;
3101 }
3102
3103 /**
3104 * select a single thread from a waitq that's waiting for a given event
3105 *
3106 * Conditions:
3107 * 'waitq' is locked
3108 *
3109 * Returns:
3110 * A locked thread that's been removed from the waitq, but has not
3111 * yet been put on a run queue. Caller is responsible to call splx
3112 * with the '*spl' value.
3113 */
3114 static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
3115 uint64_t *reserved_preposts,
3116 int priority, spl_t *spl)
3117 {
3118 int nthreads;
3119 queue_head_t threadq;
3120
3121 (void)priority;
3122
3123 queue_init(&threadq);
3124
3125 nthreads = waitq_select_n_locked(waitq, event, waitq_select_one_cb, NULL,
3126 reserved_preposts, &threadq, 1, spl);
3127
3128 /* if we selected a thread, return it (still locked) */
3129 if (!queue_empty(&threadq)) {
3130 thread_t t;
3131 queue_entry_t qe = dequeue_head(&threadq);
3132 t = qe_element(qe, struct thread, links);
3133 assert(queue_empty(&threadq)); /* there should be 1 entry */
3134 /* t has been locked and removed from all queues */
3135 return t;
3136 }
3137
3138 return THREAD_NULL;
3139 }
3140
3141
3142 struct select_thread_ctx {
3143 thread_t thread;
3144 event64_t event;
3145 spl_t *spl;
3146 };
3147
3148 /**
3149 * link walk callback invoked once for each set to which a waitq belongs
3150 *
3151 * Conditions:
3152 * initial waitq is locked
3153 * ctx->thread is unlocked
3154 *
3155 * Notes:
3156 * This may disable interrupts and early-out of the full DAG link walk by
3157 * returning KERN_ALREADY_IN_SET. In this case, the returned thread has
3158 * been removed from the waitq, its waitq state has been reset, and the
3159 * caller is responsible to call splx() with the returned interrupt state
3160 * in ctx->spl.
3161 */
3162 static int waitq_select_thread_cb(struct waitq *waitq, void *ctx,
3163 struct setid_link *link)
3164 {
3165 struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;
3166 struct waitq_set *wqset;
3167
3168 (void)waitq;
3169
3170 thread_t thread = stctx->thread;
3171 event64_t event = stctx->event;
3172
3173 if (sl_type(link) != SLT_WQS)
3174 return WQ_ITERATE_CONTINUE;
3175
3176 wqset = link->sl_wqs.sl_set;
3177
3178 if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) {
3179 *(stctx->spl) = splsched();
3180 waitq_set_lock(wqset);
3181 thread_lock(thread);
3182 } else {
3183 waitq_set_lock(wqset);
3184 *(stctx->spl) = splsched();
3185 thread_lock(thread);
3186 }
3187
3188 if ((thread->waitq == &wqset->wqset_q)
3189 && (thread->wait_event == event)) {
3190 remqueue(&thread->links);
3191 thread_clear_waitq_state(thread);
3192 /*
3193 * thread still locked,
3194 * return non-zero to break out of WQS walk
3195 */
3196 waitq_set_unlock(wqset);
3197 return WQ_ITERATE_FOUND;
3198 }
3199
3200 thread_unlock(thread);
3201 waitq_set_unlock(wqset);
3202 splx(*(stctx->spl));
3203
3204 return WQ_ITERATE_CONTINUE;
3205 }
3206
3207 /**
3208 * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting
3209 * on 'waitq' (or any set to which waitq belongs) for 'event'
3210 *
3211 * Conditions:
3212 * 'waitq' is locked
3213 * 'thread' is unlocked
3214 */
3215 static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
3216 event64_t event,
3217 thread_t thread, spl_t *spl)
3218 {
3219 struct setid_link *link;
3220 struct select_thread_ctx ctx;
3221 kern_return_t kr;
3222
3223 *spl = splsched();
3224 thread_lock(thread);
3225
3226 if ((thread->waitq == waitq) && (thread->wait_event == event)) {
3227 remqueue(&thread->links);
3228 thread_clear_waitq_state(thread);
3229 /* thread still locked */
3230 return KERN_SUCCESS;
3231 }
3232
3233 thread_unlock(thread);
3234 splx(*spl);
3235
3236 if (!waitq->waitq_set_id)
3237 return KERN_NOT_WAITING;
3238
3239 /* check to see if the set ID for this wait queue is valid */
3240 link = lt_get_link(waitq->waitq_set_id);
3241 if (!link) {
3242 /* the waitq set to which this waitq belonged has been invalidated */
3243 waitq->waitq_set_id = 0;
3244 return KERN_NOT_WAITING;
3245 }
3246
3247 /*
3248 * The thread may be waiting on a wait queue set to which
3249 * the input 'waitq' belongs. Go look for the thread in
3250 * all wait queue sets. If it's there, we'll remove it
3251 * because it's equivalent to waiting directly on the input waitq.
3252 */
3253 ctx.thread = thread;
3254 ctx.event = event;
3255 ctx.spl = spl;
3256 kr = walk_setid_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,
3257 SLT_WQS, (void *)&ctx, waitq_select_thread_cb);
3258
3259 lt_put_link(link);
3260
3261 /* we found a thread, return success */
3262 if (kr == WQ_ITERATE_FOUND)
3263 return KERN_SUCCESS;
3264
3265 return KERN_NOT_WAITING;
3266 }
3267
3268 static int prepost_exists_cb(struct waitq_set __unused *wqset,
3269 void __unused *ctx,
3270 struct wq_prepost __unused *wqp,
3271 struct waitq __unused *waitq)
3272 {
3273 /* if we get here, then we know that there is a valid prepost object! */
3274 return WQ_ITERATE_FOUND;
3275 }
3276
3277 /**
3278 * declare a thread's intent to wait on 'waitq' for 'wait_event'
3279 *
3280 * Conditions:
3281 * 'waitq' is locked
3282 * 'thread' is locked
3283 */
3284 wait_result_t waitq_assert_wait64_locked(struct waitq *waitq,
3285 event64_t wait_event,
3286 wait_interrupt_t interruptible,
3287 wait_timeout_urgency_t urgency,
3288 uint64_t deadline,
3289 uint64_t leeway,
3290 thread_t thread)
3291 {
3292 wait_result_t wait_result;
3293 int realtime = 0;
3294
3295 /*
3296 * Warning: Do _not_ place debugging print statements here.
3297 * The thread is locked!
3298 */
3299
3300 if (thread->waitq != NULL)
3301 panic("thread already waiting on %p", thread->waitq);
3302
3303 if (waitq_is_set(waitq)) {
3304 struct waitq_set *wqset = (struct waitq_set *)waitq;
3305 /*
3306 * early-out if the thread is waiting on a wait queue set
3307 * that has already been pre-posted.
3308 */
3309 if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {
3310 int ret;
3311 /*
3312 * Run through the list of potential preposts. Because
3313 * this is a hot path, we short-circuit the iteration
3314 * if we find just one prepost object.
3315 */
3316 ret = wq_prepost_foreach_locked(wqset, NULL,
3317 prepost_exists_cb);
3318 if (ret == WQ_ITERATE_FOUND) {
3319 thread->wait_result = THREAD_AWAKENED;
3320 return THREAD_AWAKENED;
3321 }
3322 }
3323 }
3324
3325 /*
3326 * Realtime threads get priority for wait queue placements.
3327 * This allows wait_queue_wakeup_one to prefer a waiting
3328 * realtime thread, similar in principle to performing
3329 * a wait_queue_wakeup_all and allowing scheduler prioritization
3330 * to run the realtime thread, but without causing the
3331 * lock contention of that scenario.
3332 */
3333 if (thread->sched_pri >= BASEPRI_REALTIME)
3334 realtime = 1;
3335
3336 /*
3337 * This is the extent to which we currently take scheduling attributes
3338 * into account. If the thread is vm privileged, we stick it at
3339 * the front of the queue. Later, these queues will honor the policy
3340 * value set at waitq_init time.
3341 */
3342 wait_result = thread_mark_wait_locked(thread, interruptible);
3343 /* thread->wait_result has been set */
3344 if (wait_result == THREAD_WAITING) {
3345 if (!waitq->waitq_fifo
3346 || (thread->options & TH_OPT_VMPRIV) || realtime)
3347 enqueue_head(&waitq->waitq_queue, &thread->links);
3348 else
3349 enqueue_tail(&waitq->waitq_queue, &thread->links);
3350
3351 thread->wait_event = wait_event;
3352 thread->waitq = waitq;
3353
3354 if (deadline != 0) {
3355 boolean_t act;
3356 act = timer_call_enter_with_leeway(&thread->wait_timer,
3357 NULL,
3358 deadline, leeway,
3359 urgency, FALSE);
3360 if (!act)
3361 thread->wait_timer_active++;
3362 thread->wait_timer_is_set = TRUE;
3363 }
3364
3365 if (waitq_is_global(waitq))
3366 waitq->waitq_eventmask = waitq->waitq_eventmask
3367 | _CAST_TO_EVENT_MASK(wait_event);
3368
3369 waitq_stats_count_wait(waitq);
3370 }
3371
3372 return wait_result;
3373 }
3374
3375 /**
3376 * remove 'thread' from its current blocking state on 'waitq'
3377 *
3378 * Conditions:
3379 * 'waitq' is locked
3380 * 'thread' is locked
3381 *
3382 * Notes:
3383 * This function is primarily used by clear_wait_internal in
3384 * sched_prim.c from the thread timer wakeup path
3385 * (i.e. the thread was waiting on 'waitq' with a timeout that expired)
3386 */
3387 void waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
3388 {
3389 (void)waitq;
3390 assert(thread->waitq == waitq);
3391
3392 remqueue(&thread->links);
3393 thread_clear_waitq_state(thread);
3394 waitq_stats_count_clear_wakeup(waitq);
3395
3396 /* clear the global event mask if this was the last thread there! */
3397 if (waitq_is_global(waitq) && queue_empty(&waitq->waitq_queue))
3398 waitq->waitq_eventmask = 0;
3399 }
3400
3401
3402 static __inline__
3403 void maybe_adjust_thread_pri(thread_t thread, int priority) {
3404 if (thread->sched_pri < priority) {
3405 if (priority <= MAXPRI) {
3406 set_sched_pri(thread, priority);
3407
3408 thread->was_promoted_on_wakeup = 1;
3409 thread->sched_flags |= TH_SFLAG_PROMOTED;
3410 }
3411 return;
3412 }
3413
3414 /*
3415 * If the caller is requesting the waitq subsystem to promote the
3416 * priority of the awoken thread, then boost the thread's priority to
3417 * the default WAITQ_BOOST_PRIORITY (if it's not already at an equal or
3418 * higher priority). This boost must be removed via a call to
3419 * waitq_clear_promotion_locked.
3420 */
3421 if (priority == WAITQ_PROMOTE_PRIORITY &&
3422 (thread->sched_pri < WAITQ_BOOST_PRIORITY ||
3423 !(thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED))) {
3424
3425 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE) | DBG_FUNC_NONE,
3426 (uintptr_t)thread_tid(thread),
3427 thread->sched_pri, thread->base_pri,
3428 WAITQ_BOOST_PRIORITY, 0);
3429 thread->sched_flags |= TH_SFLAG_WAITQ_PROMOTED;
3430 if (thread->sched_pri < WAITQ_BOOST_PRIORITY)
3431 set_sched_pri(thread, WAITQ_BOOST_PRIORITY);
3432 }
3433 }
3434
3435 /**
3436 * Clear a thread's waitq priority promotion state and the waitq's boost flag
3437 *
3438 * This function will always clear the waitq's 'waitq_boost' flag. If the
3439 * 'thread' parameter is non-null, then this function will also check the
3440 * priority promotion (boost) state of that thread. If this thread was boosted
3441 * (by having been awoken from a boosting waitq), then this boost state is
3442 * cleared. This function is to be paired with waitq_enable_promote_locked.
3443 */
3444 void waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
3445 {
3446 spl_t s;
3447
3448 assert(waitq_held(waitq));
3449 if (thread == THREAD_NULL)
3450 return;
3451
3452 if (!waitq_irq_safe(waitq))
3453 s = splsched();
3454 thread_lock(thread);
3455
3456 if (thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) {
3457 thread->sched_flags &= ~TH_SFLAG_WAITQ_PROMOTED;
3458
3459 if (thread->sched_flags & TH_SFLAG_PROMOTED_MASK) {
3460 /* it still has other promotions (mutex/rw_lock) */
3461 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
3462 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE,
3463 (uintptr_t)thread_tid(thread),
3464 thread->sched_pri,
3465 thread->base_pri,
3466 DEPRESSPRI, 0);
3467 set_sched_pri(thread, DEPRESSPRI);
3468 } else {
3469 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE,
3470 (uintptr_t)thread_tid(thread),
3471 thread->sched_pri,
3472 thread->base_pri,
3473 thread->base_pri, 0);
3474 thread_recompute_sched_pri(thread, FALSE);
3475 }
3476 }
3477
3478 thread_unlock(thread);
3479 if (!waitq_irq_safe(waitq))
3480 splx(s);
3481 }
3482
3483 /**
3484 * wakeup all threads waiting on 'waitq' for 'wake_event'
3485 *
3486 * Conditions:
3487 * 'waitq' is locked
3488 *
3489 * Notes:
3490 * May temporarily disable and re-enable interrupts
3491 * and re-adjust thread priority of each awoken thread.
3492 *
3493 * If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have
3494 * been unlocked before calling thread_go() on any returned threads, and
3495 * is guaranteed to be unlocked upon function return.
3496 */
3497 kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq,
3498 event64_t wake_event,
3499 wait_result_t result,
3500 uint64_t *reserved_preposts,
3501 int priority,
3502 waitq_lock_state_t lock_state)
3503 {
3504 kern_return_t ret;
3505 thread_t thread;
3506 spl_t th_spl;
3507 int nthreads;
3508 queue_head_t wakeup_queue;
3509
3510 assert(waitq_held(waitq));
3511 queue_init(&wakeup_queue);
3512
3513 nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
3514 reserved_preposts,
3515 &wakeup_queue, -1, &th_spl);
3516
3517 /* set each thread running */
3518 ret = KERN_NOT_WAITING;
3519
3520 #if CONFIG_WAITQ_STATS
3521 qe_foreach_element(thread, &wakeup_queue, links)
3522 waitq_stats_count_wakeup(waitq);
3523 #endif
3524 if (lock_state == WAITQ_UNLOCK)
3525 waitq_unlock(waitq);
3526
3527 qe_foreach_element_safe(thread, &wakeup_queue, links) {
3528 remqueue(&thread->links);
3529 maybe_adjust_thread_pri(thread, priority);
3530 ret = thread_go(thread, result);
3531 assert(ret == KERN_SUCCESS);
3532 thread_unlock(thread);
3533 }
3534 if (nthreads > 0)
3535 splx(th_spl);
3536 else
3537 waitq_stats_count_fail(waitq);
3538
3539 return ret;
3540 }
3541
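/*
 * Example (illustrative sketch; 'my_waitq' and 'my_event' are hypothetical,
 * and the splsched()/splx() pair shown is what callers typically need for
 * an IRQ-safe waitq):
 *
 *	kern_return_t kr;
 *	spl_t s = splsched();
 *	waitq_lock(my_waitq);
 *	kr = waitq_wakeup64_all_locked(my_waitq, my_event, THREAD_AWAKENED,
 *	                               NULL, WAITQ_ALL_PRIORITIES,
 *	                               WAITQ_UNLOCK);
 *	splx(s);
 *
 * Because WAITQ_UNLOCK was passed, 'my_waitq' is already unlocked when the
 * call returns; a KERN_NOT_WAITING result means no thread was waiting for
 * 'my_event'.
 */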
3542 /**
3543 * wakeup one thread waiting on 'waitq' for 'wake_event'
3544 *
3545 * Conditions:
3546 * 'waitq' is locked
3547 *
3548 * Notes:
3549 * May temporarily disable and re-enable interrupts.
3550 */
3551 kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq,
3552 event64_t wake_event,
3553 wait_result_t result,
3554 uint64_t *reserved_preposts,
3555 int priority,
3556 waitq_lock_state_t lock_state)
3557 {
3558 thread_t thread;
3559 spl_t th_spl;
3560
3561 assert(waitq_held(waitq));
3562
3563 thread = waitq_select_one_locked(waitq, wake_event,
3564 reserved_preposts,
3565 priority, &th_spl);
3566
3567 if (thread != THREAD_NULL)
3568 waitq_stats_count_wakeup(waitq);
3569 else
3570 waitq_stats_count_fail(waitq);
3571
3572 if (lock_state == WAITQ_UNLOCK)
3573 waitq_unlock(waitq);
3574
3575 if (thread != THREAD_NULL) {
3576 maybe_adjust_thread_pri(thread, priority);
3577 kern_return_t ret = thread_go(thread, result);
3578 assert(ret == KERN_SUCCESS);
3579 thread_unlock(thread);
3580 splx(th_spl);
3581 return ret;
3582 }
3583
3584 return KERN_NOT_WAITING;
3585 }
3586
3587 /**
3588 * wakeup one thread waiting on 'waitq' for 'wake_event'
3589 *
3590 * Conditions:
3591 * 'waitq' is locked
3592 *
3593 * Returns:
3594 * A locked, runnable thread.
3595 * If return value is non-NULL, interrupts have also
3596 * been disabled, and the caller is responsible to call
3597 * splx() with the returned '*spl' value.
3598 */
3599 thread_t waitq_wakeup64_identity_locked(struct waitq *waitq,
3600 event64_t wake_event,
3601 wait_result_t result,
3602 spl_t *spl,
3603 uint64_t *reserved_preposts,
3604 waitq_lock_state_t lock_state)
3605 {
3606 thread_t thread;
3607
3608 assert(waitq_held(waitq));
3609
3610 thread = waitq_select_one_locked(waitq, wake_event,
3611 reserved_preposts,
3612 WAITQ_ALL_PRIORITIES, spl);
3613
3614 if (thread != THREAD_NULL)
3615 waitq_stats_count_wakeup(waitq);
3616 else
3617 waitq_stats_count_fail(waitq);
3618
3619 if (lock_state == WAITQ_UNLOCK)
3620 waitq_unlock(waitq);
3621
3622 if (thread != THREAD_NULL) {
3623 kern_return_t __assert_only ret;
3624 ret = thread_go(thread, result);
3625 assert(ret == KERN_SUCCESS);
3626 }
3627
3628 return thread; /* locked if not NULL (caller responsible for spl) */
3629 }
3630
3631 /**
3632 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
3633 *
3634 * Conditions:
3635 * 'waitq' is locked
3636 * 'thread' is unlocked
3637 *
3638 * Notes:
3639 * May temporarily disable and re-enable interrupts
3640 *
3641 * If the input lock_state == WAITQ_UNLOCK then the waitq will have been
3642 * unlocked before calling thread_go() if 'thread' is to be awoken, and
3643 * is guaranteed to be unlocked upon function return.
3644 */
3645 kern_return_t waitq_wakeup64_thread_locked(struct waitq *waitq,
3646 event64_t wake_event,
3647 thread_t thread,
3648 wait_result_t result,
3649 waitq_lock_state_t lock_state)
3650 {
3651 kern_return_t ret;
3652 spl_t th_spl;
3653
3654 assert(waitq_held(waitq));
3655
3656 /*
3657 * See if the thread was still waiting there. If so, it got
3658 * dequeued and returned locked.
3659 */
3660 ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
3661
3662 if (ret == KERN_SUCCESS)
3663 waitq_stats_count_wakeup(waitq);
3664 else
3665 waitq_stats_count_fail(waitq);
3666
3667 if (lock_state == WAITQ_UNLOCK)
3668 waitq_unlock(waitq);
3669
3670 if (ret != KERN_SUCCESS)
3671 return KERN_NOT_WAITING;
3672
3673 ret = thread_go(thread, result);
3674 assert(ret == KERN_SUCCESS);
3675 thread_unlock(thread);
3676 splx(th_spl);
3677
3678 return ret;
3679 }
3680
3681
3682
3683 /* ----------------------------------------------------------------------
3684 *
3685 * In-Kernel API
3686 *
3687 * ---------------------------------------------------------------------- */
3688
3689 /**
3690 * initialize a waitq object
3691 */
3692 kern_return_t waitq_init(struct waitq *waitq, int policy)
3693 {
3694 assert(waitq != NULL);
3695
3696 /* only FIFO and LIFO for now */
3697 if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0)
3698 return KERN_INVALID_ARGUMENT;
3699
3700 waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
3701 waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
3702 waitq->waitq_prepost = 0;
3703 waitq->waitq_type = WQT_QUEUE;
3704 waitq->waitq_eventmask = 0;
3705
3706 waitq->waitq_set_id = 0;
3707 waitq->waitq_prepost_id = 0;
3708
3709 hw_lock_init(&waitq->waitq_interlock);
3710 queue_init(&waitq->waitq_queue);
3711
3712 return KERN_SUCCESS;
3713 }
3714
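/*
 * Example (illustrative): initializing an embedded waitq the same way
 * waitq_bootstrap() initializes the global queues ('my_wq' is
 * hypothetical):
 *
 *	struct waitq my_wq;
 *	kern_return_t kr;
 *
 *	kr = waitq_init(&my_wq, SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
 *	assert(kr == KERN_SUCCESS);
 *
 * SYNC_POLICY_REVERSED selects LIFO ordering of waiters instead of FIFO,
 * and SYNC_POLICY_FIXED_PRIORITY is rejected with KERN_INVALID_ARGUMENT.
 */
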
3715 struct wq_unlink_ctx {
3716 struct waitq *unlink_wq;
3717 struct waitq_set *unlink_wqset;
3718 };
3719
3720 static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
3721 struct wq_prepost *wqp, struct waitq *waitq);
3722
3723 /**
3724 * walk_setid_links callback to invalidate 'link' parameter
3725 *
3726 * Conditions:
3727 * Called from walk_setid_links.
3728 * Note that unlike other callbacks, this one makes no assumptions about
3729 * the 'waitq' parameter; specifically, it does not have to be locked or
3730 * even valid.
3731 */
3732 static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx,
3733 struct setid_link *link)
3734 {
3735 (void)waitq;
3736 (void)ctx;
3737 if (sl_type(link) == SLT_LINK && sl_is_valid(link))
3738 lt_invalidate(link);
3739
3740 if (sl_type(link) == SLT_WQS) {
3741 struct waitq_set *wqset;
3742 int do_spl = 0;
3743 spl_t spl;
3744 struct wq_unlink_ctx ulctx;
3745
3746 /*
3747 * When destroying the waitq, take the time to clear out any
3748 * preposts it may have made. This could potentially save time
3749 * on the IPC send path which would otherwise have to iterate
3750 * over lots of dead port preposts.
3751 */
3752 if (waitq->waitq_prepost_id == 0)
3753 goto out;
3754
3755 wqset = link->sl_wqs.sl_set;
3756 assert(wqset != NULL);
3757
3758 if (waitq_set_is_valid(wqset) &&
3759 waitq_irq_safe(&wqset->wqset_q)) {
3760 spl = splsched();
3761 do_spl = 1;
3762 }
3763 waitq_set_lock(wqset);
3764
3765 if (!waitq_set_is_valid(wqset)) {
3766 /* someone raced us to teardown */
3767 goto out_unlock;
3768 }
3769 if (!waitq_set_maybe_preposted(wqset))
3770 goto out_unlock;
3771
3772 ulctx.unlink_wq = waitq;
3773 ulctx.unlink_wqset = wqset;
3774 (void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,
3775 waitq_unlink_prepost_cb);
3776 out_unlock:
3777 waitq_set_unlock(wqset);
3778 if (do_spl)
3779 splx(spl);
3780 }
3781
3782 out:
3783 return WQ_ITERATE_CONTINUE;
3784 }
3785
3786
3787 /**
3788 * cleanup any link/prepost table resources associated with a waitq
3789 */
3790 void waitq_deinit(struct waitq *waitq)
3791 {
3792 uint64_t setid = 0;
3793 spl_t s;
3794
3795 if (!waitq_valid(waitq))
3796 return;
3797
3798 if (waitq_irq_safe(waitq))
3799 s = splsched();
3800 waitq_lock(waitq);
3801 if (!waitq_valid(waitq))
3802 goto out;
3803
3804 waitq_unlink_all_locked(waitq, &setid, &s, NULL);
3805 waitq->waitq_type = WQT_INVALID;
3806 assert(queue_empty(&waitq->waitq_queue));
3807
3808 out:
3809 waitq_unlock(waitq);
3810 if (waitq_irq_safe(waitq))
3811 splx(s);
3812
3813 if (setid)
3814 (void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, setid,
3815 SLT_ALL, NULL, waitq_unlink_all_cb);
3816 }
3817
3818
3819 /**
3820 * invalidate the given wq_prepost object
3821 *
3822 * Conditions:
3823 * Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)
3824 */
3825 static int wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
3826 void __unused *ctx,
3827 struct wq_prepost *wqp,
3828 struct waitq __unused *waitq)
3829 {
3830 if (wqp_type(wqp) == WQP_POST)
3831 wq_prepost_invalidate(wqp);
3832 return WQ_ITERATE_CONTINUE;
3833 }
3834
3835
3836 /**
3837 * allocate and initialize a waitq set object
3838 *
3839 * Conditions:
3840 * may block
3841 *
3842 * Returns:
3843 * allocated / initialized waitq_set object
3844 * NULL on failure
3845 */
3846 struct waitq_set *waitq_set_alloc(int policy)
3847 {
3848 struct waitq_set *wqset;
3849
3850 wqset = (struct waitq_set *)zalloc(waitq_set_zone);
3851 if (!wqset)
3852 panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);
3853
3854 kern_return_t ret;
3855 ret = waitq_set_init(wqset, policy, NULL);
3856 if (ret != KERN_SUCCESS) {
3857 zfree(waitq_set_zone, wqset);
3858 wqset = NULL;
3859 }
3860
3861 return wqset;
3862 }
3863
3864 /**
3865 * initialize a waitq set object
3866 *
3867 * Conditions:
3868 * may (rarely) block if link table needs to grow, and
3869 * no 'reserved_link' object is passed.
3870 */
3871 kern_return_t waitq_set_init(struct waitq_set *wqset,
3872 int policy, uint64_t *reserved_link)
3873 {
3874 struct setid_link *link;
3875 kern_return_t ret;
3876
3877 memset(wqset, 0, sizeof(*wqset));
3878
3879 ret = waitq_init(&wqset->wqset_q, policy);
3880 if (ret != KERN_SUCCESS)
3881 return ret;
3882
3883 wqset->wqset_q.waitq_type = WQT_SET;
3884 if (policy & SYNC_POLICY_PREPOST)
3885 wqset->wqset_q.waitq_prepost = 1;
3886 else
3887 wqset->wqset_q.waitq_prepost = 0;
3888
3889 if (reserved_link && *reserved_link != 0) {
3890 link = lt_get_reserved(*reserved_link, SLT_WQS);
3891 /* always consume the caller's reference */
3892 *reserved_link = 0;
3893 } else {
3894 link = lt_alloc_link(SLT_WQS);
3895 }
3896 if (!link)
3897 panic("Can't allocate link object for waitq set: %p", wqset);
3898
3899 link->sl_wqs.sl_set = wqset;
3900 sl_set_valid(link);
3901
3902 wqset->wqset_id = link->sl_set_id.id;
3903 wqset->wqset_prepost_id = 0;
3904 lt_put_link(link);
3905
3906 return KERN_SUCCESS;
3907 }
3908
3909 /**
3910 * clear out / release any resources associated with a waitq set
3911 *
3912 * Conditions:
3913 * may block
3914 * Note:
3915 * This will render the waitq set invalid, and it must
3916 * be re-initialized with waitq_set_init before it can be used again
3917 */
3918 void waitq_set_deinit(struct waitq_set *wqset)
3919 {
3920 struct setid_link *link = NULL;
3921 uint64_t set_id, set_links_id, prepost_id;
3922 int do_spl = 0;
3923 spl_t s;
3924
3925 if (!waitqs_is_set(wqset))
3926 panic("trying to de-initialize an invalid wqset @%p", wqset);
3927
3928 if (waitq_irq_safe(&wqset->wqset_q)) {
3929 s = splsched();
3930 do_spl = 1;
3931 }
3932 waitq_set_lock(wqset);
3933
3934 set_id = wqset->wqset_id;
3935
3936 /* grab the set's link object */
3937 link = lt_get_link(set_id);
3938 if (link)
3939 lt_invalidate(link);
3940
3941 /* someone raced us to deinit */
3942 if (!link || wqset->wqset_id != set_id || set_id != link->sl_set_id.id) {
3943 if (link)
3944 lt_put_link(link);
3945 waitq_set_unlock(wqset);
3946 if (do_spl)
3947 splx(s);
3948 return;
3949 }
3950
3951 /* every wait queue set should have a valid link object */
3952 assert(link != NULL && sl_type(link) == SLT_WQS);
3953
3954 wqset->wqset_id = 0;
3955
3956 wqset->wqset_q.waitq_type = WQT_INVALID;
3957 wqset->wqset_q.waitq_fifo = 0;
3958 wqset->wqset_q.waitq_prepost = 0;
3959 /* don't clear the 'waitq_irq' bit: it's used in locking! */
3960 wqset->wqset_q.waitq_eventmask = 0;
3961
3962 /*
3963 * This set may have a lot of preposts, or may have been a member of
3964 * many other sets. To minimize spinlock hold times, we clear out the
3965 * waitq set data structure under the lock-hold, but don't clear any
3966 * table objects. We keep handles to the prepost and set linkage
3967 * objects and free those outside the critical section.
3968 */
3969 prepost_id = wqset->wqset_prepost_id;
3970 wqset->wqset_prepost_id = 0;
3971
3972 set_links_id = 0;
3973 waitq_unlink_all_locked(&wqset->wqset_q, &set_links_id, &s, NULL);
3974
3975 waitq_set_unlock(wqset);
3976 if (do_spl)
3977 splx(s);
3978
3979 /*
3980 * walk_setid_links may race with us for access to the waitq set.
3981 * If walk_setid_links has a reference to the set, then we should wait
3982 * until the link's refcount goes to 1 (our reference) before we exit
3983 * this function. That way we ensure that the waitq set memory will
3984 * remain valid even though it's been cleared out.
3985 */
3986 while (sl_refcnt(link) > 1)
3987 delay(1);
3988 lt_put_link(link);
3989
3990 /*
3991 * release all the set link objects
3992 * (links to other sets to which this set was previously added)
3993 */
3994 if (set_links_id)
3995 (void)walk_setid_links(LINK_WALK_ONE_LEVEL, NULL, set_links_id,
3996 SLT_ALL, NULL, waitq_unlink_all_cb);
3997
3998 /* drop / unlink all the prepost table objects */
3999 (void)wq_prepost_iterate(prepost_id, NULL, wqset_clear_prepost_chain_cb);
4000 }
4001
4002 /**
4003 * de-initialize and free an allocated waitq set object
4004 *
4005 * Conditions:
4006 * may block
4007 */
4008 kern_return_t waitq_set_free(struct waitq_set *wqset)
4009 {
4010 waitq_set_deinit(wqset);
4011
4012 memset(wqset, 0, sizeof(*wqset));
4013 zfree(waitq_set_zone, wqset);
4014
4015 return KERN_SUCCESS;
4016 }
4017
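/*
 * Illustrative sketch (editor's note, not part of the original source):
 * allocate a preposting waitq set, use it, then free it. SYNC_POLICY_PREPOST
 * is the same policy flag consumed by waitq_set_init() above; error handling
 * is intentionally minimal.
 */
#if 0 /* example only: not compiled */
static void wqset_lifecycle_example(void)
{
	struct waitq_set *wqset;

	/* waitq_set_alloc() may block and panics on allocation failure */
	wqset = waitq_set_alloc(SYNC_POLICY_PREPOST);

	/* ... link member waitqs with waitq_link(), wait/wakeup ... */

	/* de-initializes the set and returns it to waitq_set_zone */
	(void)waitq_set_free(wqset);
}
#endif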
4018 #if defined(DEVELOPMENT) || defined(DEBUG)
4019 #if CONFIG_WAITQ_DEBUG
4020 /**
4021 * return the set ID of 'wqset'
4022 */
4023 uint64_t wqset_id(struct waitq_set *wqset)
4024 {
4025 if (!wqset)
4026 return 0;
4027
4028 assert(waitqs_is_set(wqset));
4029 return wqset->wqset_id;
4030 }
4031
4032 /**
4033 * returns a pointer to the waitq object embedded in 'wqset'
4034 */
4035 struct waitq *wqset_waitq(struct waitq_set *wqset)
4036 {
4037 if (!wqset)
4038 return NULL;
4039
4040 assert(waitqs_is_set(wqset));
4041
4042 return &wqset->wqset_q;
4043 }
4044 #endif /* CONFIG_WAITQ_DEBUG */
4045 #endif /* DEVELOPMENT || DEBUG */
4046
4047
4048 /**
4049 * clear all preposts originating from 'waitq'
4050 *
4051 * Conditions:
4052 * 'waitq' locked
4053 * may (rarely) spin waiting for another on-core thread to
4054 * release the last reference to the waitq's prepost link object
4055 *
4056 * NOTE:
4057 * If this function needs to spin, it will drop the waitq lock!
4058 * The return value of the function indicates whether or not this
4059 * happened: 1 == lock was dropped, 0 == lock held
4060 */
4061 int waitq_clear_prepost_locked(struct waitq *waitq, spl_t *s)
4062 {
4063 struct wq_prepost *wqp;
4064 int dropped_lock = 0;
4065
4066 if (waitq->waitq_prepost_id == 0)
4067 return 0;
4068
4069 wqp = wq_prepost_get(waitq->waitq_prepost_id);
4070 waitq->waitq_prepost_id = 0;
4071 if (wqp) {
4072 uint64_t wqp_id = wqp->wqp_prepostid.id;
4073 wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",
4074 wqp->wqp_prepostid.id, wqp_refcnt(wqp));
4075 wq_prepost_invalidate(wqp);
4076 while (wqp_refcnt(wqp) > 1) {
4077 int do_spl = waitq_irq_safe(waitq);
4078
4079 /*
4080 * Some other thread must have raced us to grab a link
4081 * object reference before we invalidated it. This
4082 * means that they are probably trying to access the
4083 * waitq to which the prepost object points. We need
4084 * to wait here until the other thread drops their
4085 * reference. We know that no one else can get a
4086 * reference (the object has been invalidated), and
4087 * that prepost references are short-lived (dropped on
4088 * a call to wq_prepost_put). We also know that no one
4089 * blocks while holding a reference; therefore, the
4090 * other reference holder must be on-core. We'll just
4091 * sit and wait for the other reference to be dropped.
4092 */
4093 disable_preemption();
4094
4095 waitq_unlock(waitq);
4096 if (s && do_spl)
4097 splx(*s);
4098 dropped_lock = 1;
4099 /*
4100 * don't yield here, just spin and assume the other
4101 * consumer is already on core...
4102 */
4103 delay(1);
4104 if (s && do_spl)
4105 *s = splsched();
4106 waitq_lock(waitq);
4107
4108 enable_preemption();
4109 }
4110 if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id)
4111 wq_prepost_put(wqp);
4112 }
4113
4114 return dropped_lock;
4115 }
4116
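/*
 * Illustrative sketch (editor's note, not part of the original source):
 * how a caller might consume the dropped-lock indication returned by
 * waitq_clear_prepost_locked(). The 'revalidate' step is a hypothetical
 * placeholder for whatever re-checks the caller needs once the lock may
 * have been dropped and re-acquired.
 */
#if 0 /* example only: not compiled */
static void clear_prepost_example(struct waitq *waitq)
{
	spl_t s = 0;
	int dropped;

	if (waitq_irq_safe(waitq))
		s = splsched();
	waitq_lock(waitq);

	dropped = waitq_clear_prepost_locked(waitq, &s);
	if (dropped) {
		/*
		 * the waitq lock was dropped and re-acquired while the last
		 * prepost reference drained: re-validate any state sampled
		 * before the call.
		 */
		/* revalidate_my_state(waitq); -- hypothetical */
	}

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq))
		splx(s);
}
#endif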
4117 /**
4118 * clear all preposts originating from 'waitq'
4119 *
4120 * Conditions:
4121 * 'waitq' is not locked
4122 * may disable and re-enable interrupts
4123 */
4124 void waitq_clear_prepost(struct waitq *waitq)
4125 {
4126 spl_t s;
4127 int do_spl = waitq_irq_safe(waitq);
4128
4129 assert(waitq_valid(waitq));
4130
4131 if (do_spl)
4132 s = splsched();
4133 waitq_lock(waitq);
4134 /* it doesn't matter to us if the lock is dropped here */
4135 (void)waitq_clear_prepost_locked(waitq, &s);
4136 waitq_unlock(waitq);
4137 if (do_spl)
4138 splx(s);
4139 }
4140
4141 /**
4142 * return the waitq's prepost object ID (allocating one if necessary)
4143 *
4144 * Conditions:
4145 * 'waitq' is unlocked
4146 */
4147 uint64_t waitq_get_prepost_id(struct waitq *waitq)
4148 {
4149 struct wq_prepost *wqp;
4150 uint64_t wqp_id = 0;
4151 spl_t s;
4152
4153 if (!waitq_valid(waitq))
4154 return 0;
4155
4156 if (waitq_irq_safe(waitq))
4157 s = splsched();
4158 waitq_lock(waitq);
4159
4160 if (!waitq_valid(waitq))
4161 goto out_unlock;
4162
4163 if (waitq->waitq_prepost_id) {
4164 wqp_id = waitq->waitq_prepost_id;
4165 goto out_unlock;
4166 }
4167
4168 /* don't hold a spinlock while allocating a prepost object */
4169 waitq_unlock(waitq);
4170 if (waitq_irq_safe(waitq))
4171 splx(s);
4172
4173 wqp = wq_prepost_alloc(WQP_WQ, 1);
4174 if (!wqp)
4175 return 0;
4176
4177 /* re-acquire the waitq lock */
4178 if (waitq_irq_safe(waitq))
4179 s = splsched();
4180 waitq_lock(waitq);
4181
4182 if (!waitq_valid(waitq)) {
4183 wq_prepost_put(wqp);
4184 wqp_id = 0;
4185 goto out_unlock;
4186 }
4187
4188 if (waitq->waitq_prepost_id) {
4189 /* we were beat by someone else */
4190 wq_prepost_put(wqp);
4191 wqp_id = waitq->waitq_prepost_id;
4192 goto out_unlock;
4193 }
4194
4195 wqp->wqp_wq.wqp_wq_ptr = waitq;
4196
4197 wqp_set_valid(wqp);
4198 wqp_id = wqp->wqp_prepostid.id;
4199 waitq->waitq_prepost_id = wqp_id;
4200
4201 wq_prepost_put(wqp);
4202
4203 out_unlock:
4204 waitq_unlock(waitq);
4205 if (waitq_irq_safe(waitq))
4206 splx(s);
4207
4208 return wqp_id;
4209 }
4210
4211
4212 static int waitq_inset_cb(struct waitq *waitq, void *ctx, struct setid_link *link)
4213 {
4214 uint64_t setid = *(uint64_t *)ctx;
4215 int ltype = sl_type(link);
4216 (void)waitq;
4217 if (ltype == SLT_WQS && link->sl_set_id.id == setid) {
4218 wqdbg_v(" waitq already in set 0x%llx", setid);
4219 return WQ_ITERATE_FOUND;
4220 } else if (ltype == SLT_LINK) {
4221 /*
4222 * break out early if we see a link that points to the setid
4223 * in question. This saves us a step in the
4224 * iteration/recursion
4225 */
4226 wqdbg_v(" waitq already in set 0x%llx (SLT_LINK)", setid);
4227 if (link->sl_link.sl_left_setid == setid ||
4228 link->sl_link.sl_right_setid == setid)
4229 return WQ_ITERATE_FOUND;
4230 }
4231
4232 return WQ_ITERATE_CONTINUE;
4233 }
4234
4235 /**
4236 * determine if 'waitq' is a member of 'wqset'
4237 *
4238 * Conditions:
4239 * neither 'waitq' nor 'wqset' is locked
4240 * may disable and re-enable interrupts while locking 'waitq'
4241 */
4242 boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset)
4243 {
4244 kern_return_t kr = WQ_ITERATE_SUCCESS;
4245 uint64_t setid;
4246 spl_t s;
4247
4248 if (!waitq_valid(waitq))
4249 panic("Invalid waitq: %p", waitq);
4250
4251 if (!waitqs_is_set(wqset))
4252 return FALSE;
4253
4254 if (waitq_irq_safe(waitq))
4255 s = splsched();
4256 waitq_lock(waitq);
4257
4258 setid = wqset->wqset_id;
4259 if (!setid)
4260 goto out_unlock;
4261
4262 /* fast path: most waitqs are members of only 1 set */
4263 if (waitq->waitq_set_id == setid) {
4264 waitq_unlock(waitq);
4265 if (waitq_irq_safe(waitq))
4266 splx(s);
4267 return TRUE;
4268 }
4269
4270 /* walk the link table and look for the Set ID of wqset */
4271 kr = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4272 SLT_ALL, (void *)&setid, waitq_inset_cb);
4273
4274 out_unlock:
4275 waitq_unlock(waitq);
4276 if (waitq_irq_safe(waitq))
4277 splx(s);
4278
4279 if (kr == WQ_ITERATE_FOUND)
4280 return TRUE;
4281 return FALSE;
4282 }
4283
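/*
 * Illustrative sketch (editor's note, not part of the original source):
 * a membership check before conditionally linking a waitq into a set.
 * Passing NULL for 'reserved_link' is only safe for waitqs that are not
 * IRQ-safe (see the panic in waitq_link() below).
 */
#if 0 /* example only: not compiled */
static void ensure_linked_example(struct waitq *waitq, struct waitq_set *wqset)
{
	if (!waitq_member(waitq, wqset)) {
		/* may (rarely) block on link table growth */
		(void)waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, NULL);
	}
}
#endif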
4284 /**
4285 * Returns true if the given waitq is a member of at least one set
4286 */
4287 boolean_t waitq_in_set(struct waitq *waitq)
4288 {
4289 struct setid_link *link;
4290 boolean_t inset = FALSE;
4291 spl_t s;
4292
4293 if (waitq_irq_safe(waitq))
4294 s = splsched();
4295 waitq_lock(waitq);
4296
4297 if (!waitq->waitq_set_id)
4298 goto out_unlock;
4299
4300 link = lt_get_link(waitq->waitq_set_id);
4301 if (link) {
4302 /* if we get here, the waitq is in _at_least_one_ set */
4303 inset = TRUE;
4304 lt_put_link(link);
4305 } else {
4306 /* we can just optimize this for next time */
4307 waitq->waitq_set_id = 0;
4308 }
4309
4310 out_unlock:
4311 waitq_unlock(waitq);
4312 if (waitq_irq_safe(waitq))
4313 splx(s);
4314 return inset;
4315 }
4316
4317
4318 /**
4319 * pre-allocate a waitq link structure from the link table
4320 *
4321 * Conditions:
4322 * 'waitq' is not locked
4323 * may (rarely) block if link table needs to grow
4324 */
4325 uint64_t waitq_link_reserve(struct waitq *waitq)
4326 {
4327 struct setid_link *link;
4328 uint64_t reserved_id = 0;
4329
4330 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
4331
4332 /*
4333 * We've asserted that the caller can block, so we enforce a
4334 * minimum-free table element policy here.
4335 */
4336 lt_ensure_free_space();
4337
4338 (void)waitq;
4339 link = lt_alloc_link(WQT_RESERVED);
4340 if (!link)
4341 return 0;
4342
4343 reserved_id = link->sl_set_id.id;
4344
4345 return reserved_id;
4346 }
4347
4348 /**
4349 * release a pre-allocated waitq link structure
4350 */
4351 void waitq_link_release(uint64_t id)
4352 {
4353 struct setid_link *link;
4354
4355 if (id == 0)
4356 return;
4357
4358 link = lt_get_reserved(id, SLT_LINK);
4359 if (!link)
4360 return;
4361
4362 /*
4363 * if we successfully got a link object, then we know
4364 * it's not been marked valid, and can be released with
4365 * a standard lt_put_link() which should free the element.
4366 */
4367 lt_put_link(link);
4368 #if CONFIG_WAITQ_STATS
4369 g_linktable.nreserved_releases += 1;
4370 #endif
4371 }
4372
4373 /**
4374 * link 'waitq' to the set identified by 'setid' using the 'link' structure
4375 *
4376 * Conditions:
4377 * 'waitq' is locked
4378 * caller should have a reference to the 'link' object
4379 */
4380 static kern_return_t waitq_link_internal(struct waitq *waitq,
4381 uint64_t setid, struct setid_link *link)
4382 {
4383 struct setid_link *qlink;
4384 kern_return_t kr;
4385
4386 assert(waitq_held(waitq));
4387
4388 /*
4389 * If the waitq_set_id field is empty, then this waitq is not
4390 * a member of any other set. All we have to do is update the
4391 * field.
4392 */
4393 if (!waitq->waitq_set_id) {
4394 waitq->waitq_set_id = setid;
4395 return KERN_SUCCESS;
4396 }
4397
4398 qlink = lt_get_link(waitq->waitq_set_id);
4399 if (!qlink) {
4400 /*
4401 * The set to which this wait queue belonged has been
4402 * destroyed / invalidated. We can re-use the waitq field.
4403 */
4404 waitq->waitq_set_id = setid;
4405 return KERN_SUCCESS;
4406 }
4407 lt_put_link(qlink);
4408
4409 /*
4410 * Check to see if it's already a member of the set.
4411 *
4412 * TODO: check for cycles!
4413 */
4414 kr = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4415 SLT_ALL, (void *)&setid, waitq_inset_cb);
4416 if (kr == WQ_ITERATE_FOUND)
4417 return kr;
4418
4419 /*
4420 * This wait queue is a member of at least one set already,
4421 * and _not_ a member of the given set. Use our previously
4422 * allocated link object, and hook it up to the wait queue.
4423 * Note that it's possible that one or more of the wait queue sets to
4424 * which the wait queue belongs was invalidated before we allocated
4425 * this link object. That's OK because the next time we use that
4426 * object we'll just ignore it.
4427 */
4428 link->sl_link.sl_left_setid = setid;
4429 link->sl_link.sl_right_setid = waitq->waitq_set_id;
4430 sl_set_valid(link);
4431
4432 waitq->waitq_set_id = link->sl_set_id.id;
4433
4434 return KERN_SUCCESS;
4435 }
4436
4437 /**
4438 * link 'waitq' to 'wqset'
4439 *
4440 * Conditions:
4441 * if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.
4442 * Otherwise, 'waitq' must be locked.
4443 *
4444 * may (rarely) block on link table allocation if the table has to grow,
4445 * and no 'reserved_link' object is passed.
4446 *
4447 * Notes:
4448 * The caller can guarantee that this function will never block by
4449 * pre-allocating a link table object and passing its ID in 'reserved_link'
4450 */
4451 kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset,
4452 waitq_lock_state_t lock_state, uint64_t *reserved_link)
4453 {
4454 kern_return_t kr;
4455 struct setid_link *link;
4456 int should_lock = (lock_state == WAITQ_SHOULD_LOCK);
4457 spl_t s;
4458
4459 if (!waitq_valid(waitq))
4460 panic("Invalid waitq: %p", waitq);
4461
4462 if (!waitqs_is_set(wqset))
4463 return KERN_INVALID_ARGUMENT;
4464
4465 wqdbg_v("Link waitq %p to wqset 0x%llx",
4466 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4467
4468 if (waitq_irq_safe(waitq) && (!reserved_link || *reserved_link == 0)) {
4469 /*
4470 * wait queues that need IRQs disabled cannot block waiting
4471 * for table growth to complete. Even though this is rare,
4472 * we require all these waitqs to pass in a reserved link
4473 * object to avoid the potential to block.
4474 */
4475 panic("Global/IRQ-safe waitq %p cannot link to %p without"
4476 "reserved object!", waitq, wqset);
4477 }
4478
4479 /*
4480 * We _might_ need a new link object here, so we'll grab outside
4481 * the lock because the alloc call _might_ block.
4482 *
4483 * If the caller reserved a link beforehand, then lt_get_link
4484 * is guaranteed not to block because the caller holds an extra
4485 * reference to the link which, in turn, holds a reference to the
4486 * link table.
4487 */
4488 if (reserved_link && *reserved_link != 0) {
4489 link = lt_get_reserved(*reserved_link, SLT_LINK);
4490 /* always consume the caller's reference */
4491 *reserved_link = 0;
4492 } else {
4493 link = lt_alloc_link(SLT_LINK);
4494 }
4495 if (!link)
4496 return KERN_NO_SPACE;
4497
4498 if (should_lock) {
4499 if (waitq_irq_safe(waitq))
4500 s = splsched();
4501 waitq_lock(waitq);
4502 }
4503
4504 kr = waitq_link_internal(waitq, wqset->wqset_id, link);
4505
4506 if (should_lock) {
4507 waitq_unlock(waitq);
4508 if (waitq_irq_safe(waitq))
4509 splx(s);
4510 }
4511
4512 lt_put_link(link);
4513
4514 return kr;
4515 }
4516
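/*
 * Illustrative sketch (editor's note, not part of the original source):
 * the reserve/link/release pattern that lets an IRQ-safe waitq be linked
 * without any chance of blocking inside waitq_link(). The reservation is
 * made before any spinlocks are taken; waitq_link() always consumes a
 * passed-in reservation (zeroing it), so the trailing release is a no-op
 * on that path and only returns the object if it went unused.
 */
#if 0 /* example only: not compiled */
static kern_return_t link_irq_safe_example(struct waitq *waitq,
					   struct waitq_set *wqset)
{
	kern_return_t kr;
	uint64_t reserved_link;

	/* may (rarely) block while the link table grows */
	reserved_link = waitq_link_reserve(waitq);

	/* waitq_link() consumes 'reserved_link' and sets it to 0 */
	kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);

	/* releasing an ID of 0 is a harmless no-op */
	waitq_link_release(reserved_link);

	return kr;
}
#endif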
4517 /**
4518 * helper: unlink 'waitq' from waitq set identified by 'setid'
4519 * this function also prunes invalid objects from the tree
4520 *
4521 * Conditions:
4522 * MUST be called from walk_setid_links link table walk
4523 * 'waitq' is locked
4524 *
4525 * Notes:
4526 * This is a helper function which compresses the link table by culling
4527 * unused or unnecessary links. See comments below for different
4528 * scenarios.
4529 */
4530 static inline int waitq_maybe_remove_link(struct waitq *waitq,
4531 uint64_t setid,
4532 struct setid_link *parent,
4533 struct setid_link *left,
4534 struct setid_link *right)
4535 {
4536 uint64_t *wq_setid = &waitq->waitq_set_id;
4537
4538 /*
4539 * There are two scenarios:
4540 *
4541 * Scenario 1:
4542 * --------------------------------------------------------------------
4543 * waitq->waitq_set_id == parent
4544 *
4545 * parent(LINK)
4546 * / \
4547 * / \
4548 * / \
4549 * L(LINK/WQS_l) R(LINK/WQS_r)
4550 *
4551 * In this scenario, we assert that the original waitq points to the
4552 * parent link we were passed in. If WQS_l (or WQS_r) is the waitq
4553 * set we're looking for, we can set the corresponding parent
4554 * link id (left or right) to 0. To compress the tree, we can reset the
4555 * waitq_set_id of the original waitq to point to the side of the
4556 * parent that is still valid. We then discard the parent link object.
4557 */
4558 if (*wq_setid == parent->sl_set_id.id) {
4559 if (!left && !right) {
4560 /* completely invalid children */
4561 lt_invalidate(parent);
4562 wqdbg_v("S1, L+R");
4563 *wq_setid = 0;
4564 return WQ_ITERATE_INVALID;
4565 } else if (!left || left->sl_set_id.id == setid) {
4566 /*
4567 * if the left side matches, we know it points either to the
4568 * WQS we're unlinking, or to an invalid object:
4569 * no need to invalidate it
4570 */
4571 *wq_setid = right ? right->sl_set_id.id : 0;
4572 lt_invalidate(parent);
4573 wqdbg_v("S1, L");
4574 return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4575 } else if (!right || right->sl_set_id.id == setid) {
4576 /*
4577 * if the right side matches, we know it points either to the
4578 * WQS we're unlinking, or to an invalid object:
4579 * no need to invalidate it
4580 */
4581 *wq_setid = left ? left->sl_set_id.id : 0;
4582 lt_invalidate(parent);
4583 wqdbg_v("S1, R");
4584 return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4585 }
4586 }
4587
4588 /*
4589 * the tree walk starts at the top-of-tree and moves down,
4590 * so these are safe asserts.
4591 */
4592 assert(left || right); /* one of them has to be valid at this point */
4593
4594 /*
4595 * Scenario 2:
4596 * --------------------------------------------------------------------
4597 * waitq->waitq_set_id == ... (OR parent)
4598 *
4599 * ...
4600 * |
4601 * parent
4602 * / \
4603 * / \
4604 * L(LINK) R(LINK)
4605 * /\ /\
4606 * / \ / \
4607 * / \ Rl(*) Rr(*)
4608 * Ll(WQS) Lr(WQS)
4609 *
4610 * In this scenario, a leaf node of either the left or right side
4611 * could be the wait queue set we're looking to unlink. We also handle
4612 * the case where one of these links is invalid. If a leaf node is
4613 * invalid or it's the set we're looking for, we can safely remove the
4614 * middle link (left or right) and point the parent link directly to
4615 * the remaining leaf node.
4616 */
4617 if (left && sl_type(left) == SLT_LINK) {
4618 uint64_t Ll, Lr;
4619 struct setid_link *linkLl, *linkLr;
4620 assert(left->sl_set_id.id != setid);
4621 Ll = left->sl_link.sl_left_setid;
4622 Lr = left->sl_link.sl_right_setid;
4623 linkLl = lt_get_link(Ll);
4624 linkLr = lt_get_link(Lr);
4625 if (!linkLl && !linkLr) {
4626 /*
4627 * The left object points to two invalid objects!
4628 * We can invalidate the left w/o touching the parent.
4629 */
4630 lt_invalidate(left);
4631 wqdbg_v("S2, Ll+Lr");
4632 return WQ_ITERATE_INVALID;
4633 } else if (!linkLl || Ll == setid) {
4634 /* Ll is invalid and/or the wait queue set we're looking for */
4635 parent->sl_link.sl_left_setid = Lr;
4636 lt_invalidate(left);
4637 lt_put_link(linkLl);
4638 lt_put_link(linkLr);
4639 wqdbg_v("S2, Ll");
4640 return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4641 } else if (!linkLr || Lr == setid) {
4642 /* Lr is invalid and/or the wait queue set we're looking for */
4643 parent->sl_link.sl_left_setid = Ll;
4644 lt_invalidate(left);
4645 lt_put_link(linkLr);
4646 lt_put_link(linkLl);
4647 wqdbg_v("S2, Lr");
4648 return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4649 }
4650 lt_put_link(linkLl);
4651 lt_put_link(linkLr);
4652 }
4653
4654 if (right && sl_type(right) == SLT_LINK) {
4655 uint64_t Rl, Rr;
4656 struct setid_link *linkRl, *linkRr;
4657 assert(right->sl_set_id.id != setid);
4658 Rl = right->sl_link.sl_left_setid;
4659 Rr = right->sl_link.sl_right_setid;
4660 linkRl = lt_get_link(Rl);
4661 linkRr = lt_get_link(Rr);
4662 if (!linkRl && !linkRr) {
4663 /*
4664 * The right object points to two invalid objects!
4665 * We can invalidate the right w/o touching the parent.
4666 */
4667 lt_invalidate(right);
4668 wqdbg_v("S2, Rl+Rr");
4669 return WQ_ITERATE_INVALID;
4670 } else if (!linkRl || Rl == setid) {
4671 /* Rl is invalid and/or the wait queue set we're looking for */
4672 parent->sl_link.sl_right_setid = Rr;
4673 lt_invalidate(right);
4674 lt_put_link(linkRl);
4675 lt_put_link(linkRr);
4676 wqdbg_v("S2, Rl");
4677 return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4678 } else if (!linkRr || Rr == setid) {
4679 /* Rr is invalid and/or the wait queue set we're looking for */
4680 parent->sl_link.sl_right_setid = Rl;
4681 lt_invalidate(right);
4682 lt_put_link(linkRl);
4683 lt_put_link(linkRr);
4684 wqdbg_v("S2, Rr");
4685 return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4686 }
4687 lt_put_link(linkRl);
4688 lt_put_link(linkRr);
4689 }
4690
4691 return WQ_ITERATE_CONTINUE;
4692 }
4693
4694 /**
4695 * link table walk callback that unlinks 'waitq' from 'ctx->setid'
4696 *
4697 * Conditions:
4698 * called from walk_setid_links
4699 * 'waitq' is locked
4700 *
4701 * Notes:
4702 * uses waitq_maybe_remove_link() to compress the linktable and
4703 * perform the actual unlinking
4704 */
4705 static int waitq_unlink_cb(struct waitq *waitq, void *ctx,
4706 struct setid_link *link)
4707 {
4708 uint64_t setid = *((uint64_t *)ctx);
4709 struct setid_link *right, *left;
4710 int ret = 0;
4711
4712 if (sl_type(link) != SLT_LINK)
4713 return WQ_ITERATE_CONTINUE;
4714
4715 do {
4716 left = lt_get_link(link->sl_link.sl_left_setid);
4717 right = lt_get_link(link->sl_link.sl_right_setid);
4718
4719 ret = waitq_maybe_remove_link(waitq, setid, link, left, right);
4720
4721 lt_put_link(left);
4722 lt_put_link(right);
4723
4724 if (!sl_is_valid(link))
4725 return WQ_ITERATE_INVALID;
4726 /* A ret value of UNLINKED will break us out of table walk */
4727 } while (ret == WQ_ITERATE_INVALID);
4728
4729 return ret;
4730 }
4731
4732
4733 /**
4734 * undo/remove a prepost from 'ctx' (waitq) to 'wqset'
4735 *
4736 * Conditions:
4737 * Called from wq_prepost_foreach_locked OR wq_prepost_iterate
4738 * 'wqset' may be NULL
4739 * (ctx)->unlink_wqset is locked
4740 */
4741 static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
4742 struct wq_prepost *wqp, struct waitq *waitq)
4743 {
4744 struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;
4745
4746 if (waitq != ulctx->unlink_wq)
4747 return WQ_ITERATE_CONTINUE;
4748
4749 if (wqp_type(wqp) == WQP_WQ &&
4750 wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {
4751 /* this is the only prepost on this wait queue set */
4752 wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);
4753 ulctx->unlink_wqset->wqset_prepost_id = 0;
4754 return WQ_ITERATE_BREAK;
4755 }
4756
4757 assert(wqp_type(wqp) == WQP_POST);
4758
4759 /*
4760 * The prepost object 'wqp' points to a waitq which should no longer
4761 * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost
4762 * object from the list and break out of the iteration. Using the
4763 * context object in this way allows this same callback function to be
4764 * used from both wq_prepost_foreach_locked and wq_prepost_iterate.
4765 */
4766 wq_prepost_remove(ulctx->unlink_wqset, wqp);
4767 return WQ_ITERATE_BREAK;
4768 }
4769
4770 /**
4771 * unlink 'waitq' from 'wqset'
4772 *
4773 * Conditions:
4774 * 'waitq' is locked
4775 * 'wqset' is _not_ locked
4776 * may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock
4777 * (see waitq_clear_prepost_locked)
4778 */
4779 static kern_return_t waitq_unlink_locked(struct waitq *waitq,
4780 struct waitq_set *wqset,
4781 spl_t *s)
4782 {
4783 uint64_t setid;
4784 kern_return_t kr;
4785
4786 setid = wqset->wqset_id;
4787
4788 if (waitq->waitq_set_id == 0) {
4789 /*
4790 * TODO:
4791 * it doesn't belong to anyone, and it has a prepost object?
4792 * This is an artifact of not cleaning up after kqueues when
4793 * they prepost into select sets...
4794 */
4795 if (waitq->waitq_prepost_id != 0)
4796 (void)waitq_clear_prepost_locked(waitq, s);
4797 return KERN_NOT_IN_SET;
4798 }
4799
4800 if (waitq->waitq_set_id == setid) {
4801 waitq->waitq_set_id = 0;
4802 /*
4803 * This was the only set to which the waitq belonged: we can
4804 * safely release the waitq's prepost object. It doesn't
4805 * matter if this function drops and re-acquires the lock
4806 * because we're not manipulating waitq state any more.
4807 */
4808 (void)waitq_clear_prepost_locked(waitq, s);
4809 return KERN_SUCCESS;
4810 }
4811
4812 /*
4813 * The waitq was a member of more than one set, so we need to
4814 * handle potentially compressing the link table, and
4815 * adjusting the waitq->waitq_set_id value.
4816 *
4817 * Note: we can't free the waitq's associated prepost object (if any)
4818 * because it may be in use by the one or more _other_ sets to
4819 * which this queue belongs.
4820 *
4821 * Note: This function only handles a single level of the queue linkage.
4822 * Removing a waitq from a set to which it does not directly
4823 * belong is undefined. For example, if a waitq belonged to set
4824 * A, and set A belonged to set B, you could not remove the waitq
4825 * from set B.
4826 */
4827 kr = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4828 SLT_LINK, (void *)&setid, waitq_unlink_cb);
4829
4830 if (kr == WQ_ITERATE_UNLINKED) {
4831 struct wq_unlink_ctx ulctx;
4832 int do_spl = 0;
4833
4834 kr = KERN_SUCCESS; /* found it and dis-associated it */
4835
4836 if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) {
4837 *s = splsched();
4838 do_spl = 1;
4839 }
4840 waitq_set_lock(wqset);
4841 /*
4842 * clear out any prepost from waitq into wqset
4843 * TODO: this could be more efficient than a linear search of
4844 * the waitq set's prepost list.
4845 */
4846 ulctx.unlink_wq = waitq;
4847 ulctx.unlink_wqset = wqset;
4848 (void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,
4849 waitq_unlink_prepost_cb);
4850 waitq_set_unlock(wqset);
4851 if (do_spl)
4852 splx(*s);
4853 } else {
4854 kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */
4855 }
4856
4857 return kr;
4858 }
4859
4860 /**
4861 * unlink 'waitq' from 'wqset'
4862 *
4863 * Conditions:
4864 * neither 'waitq' nor 'wqset' is locked
4865 * may disable and re-enable interrupts
4866 * may (rarely) spin in prepost clear
4867 * (see waitq_clear_prepost_locked)
4868 */
4869 kern_return_t waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)
4870 {
4871 kern_return_t kr = KERN_SUCCESS;
4872 spl_t s;
4873
4874 assert(waitqs_is_set(wqset));
4875
4876 /*
4877 * we allow the waitq to be invalid because the caller may be trying
4878 * to clear out old/dirty state
4879 */
4880 if (!waitq_valid(waitq))
4881 return KERN_INVALID_ARGUMENT;
4882
4883 wqdbg_v("unlink waitq %p from set 0x%llx",
4884 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4885
4886 if (waitq_irq_safe(waitq))
4887 s = splsched();
4888 waitq_lock(waitq);
4889
4890 kr = waitq_unlink_locked(waitq, wqset, &s);
4891
4892 waitq_unlock(waitq);
4893 if (waitq_irq_safe(waitq))
4894 splx(s);
4895
4896 return kr;
4897 }
4898
4899 /**
4900 * unlink a waitq from a waitq set, but reference the waitq by its prepost ID
4901 *
4902 * Conditions:
4903 * 'wqset' is unlocked
4904 * wqp_id may be valid or invalid
4905 */
4906 void waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)
4907 {
4908 struct wq_prepost *wqp;
4909
4910 disable_preemption();
4911 wqp = wq_prepost_get(wqp_id);
4912 if (wqp) {
4913 struct waitq *wq;
4914 spl_t s;
4915
4916 wq = wqp->wqp_wq.wqp_wq_ptr;
4917
4918 /*
4919 * lock the waitq, then release our prepost ID reference, then
4920 * unlink the waitq from the wqset: this ensures that we don't
4921 * hold a prepost ID reference during the unlink, but we also
4922 * complete the unlink operation atomically to avoid a race
4923 * with waitq_unlink[_all].
4924 */
4925 if (waitq_irq_safe(wq))
4926 s = splsched();
4927 waitq_lock(wq);
4928 wq_prepost_put(wqp);
4929
4930 if (!waitq_valid(wq)) {
4931 /* someone already tore down this waitq! */
4932 waitq_unlock(wq);
4933 if (waitq_irq_safe(wq))
4934 splx(s);
4935 enable_preemption();
4936 return;
4937 }
4938
4939 /* this _may_ drop the wq lock, but that's OK */
4940 waitq_unlink_locked(wq, wqset, &s);
4941
4942 waitq_unlock(wq);
4943 if (waitq_irq_safe(wq))
4944 splx(s);
4945 }
4946 enable_preemption();
4947 return;
4948 }
4949
4950
4951 /**
4952 * unlink 'waitq' from all sets to which it belongs
4953 *
4954 * Conditions:
4955 * 'waitq' is locked
4956 *
4957 * Notes:
4958 * may drop and re-acquire the waitq lock
4959 * may (rarely) spin (see waitq_clear_prepost_locked)
4960 */
4961 kern_return_t waitq_unlink_all_locked(struct waitq *waitq, uint64_t *old_set_id,
4962 spl_t *s, int *dropped_lock)
4963 {
4964 wqdbg_v("unlink waitq %p from all sets",
4965 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
4966
4967 *old_set_id = 0;
4968
4969 /* it's not a member of any sets */
4970 if (waitq->waitq_set_id == 0)
4971 return KERN_SUCCESS;
4972
4973 *old_set_id = waitq->waitq_set_id;
4974 waitq->waitq_set_id = 0;
4975
4976 /*
4977 * invalidate the prepost entry for this waitq.
4978 * This may drop and re-acquire the waitq lock, but that's OK because
4979 * if it was added to another set and preposted to that set in the
4980 * time we drop the lock, the state will remain consistent.
4981 */
4982 int dropped = waitq_clear_prepost_locked(waitq, s);
4983 if (dropped_lock)
4984 *dropped_lock = dropped;
4985
4986 return KERN_SUCCESS;
4987 }
4988
4989 /**
4990 * unlink 'waitq' from all sets to which it belongs
4991 *
4992 * Conditions:
4993 * 'waitq' is not locked
4994 * may disable and re-enable interrupts
4995 * may (rarely) spin
4996 * (see waitq_unlink_all_locked, waitq_clear_prepost_locked)
4997 */
4998 kern_return_t waitq_unlink_all(struct waitq *waitq)
4999 {
5000 kern_return_t kr = KERN_SUCCESS;
5001 uint64_t setid = 0;
5002 spl_t s;
5003
5004 if (!waitq_valid(waitq))
5005 panic("Invalid waitq: %p", waitq);
5006
5007 if (waitq_irq_safe(waitq))
5008 s = splsched();
5009 waitq_lock(waitq);
5010 if (waitq_valid(waitq))
5011 kr = waitq_unlink_all_locked(waitq, &setid, &s, NULL);
5012 waitq_unlock(waitq);
5013 if (waitq_irq_safe(waitq))
5014 splx(s);
5015
5016 if (setid) {
5017 /*
5018 * Walk the link table and invalidate each LINK object that
5019 * used to connect this waitq to one or more sets: this works
5020 * because SLT_LINK objects are private to each wait queue
5021 */
5022 (void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, setid,
5023 SLT_LINK, NULL, waitq_unlink_all_cb);
5024 }
5025
5026 return kr;
5027 }
5028
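/*
 * Illustrative sketch (editor's note, not part of the original source):
 * detaching a waitq from its set memberships. waitq_deinit() (above)
 * performs the same unlink internally, so explicit calls like these are
 * only needed when the waitq itself should remain usable afterwards.
 */
#if 0 /* example only: not compiled */
static void detach_example(struct waitq *waitq, struct waitq_set *wqset)
{
	/* remove from one specific set ... */
	(void)waitq_unlink(waitq, wqset);

	/* ... or sever every set membership at once */
	(void)waitq_unlink_all(waitq);
}
#endif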
5029
5030 /**
5031 * unlink all waitqs from 'wqset'
5032 *
5033 * Conditions:
5034 * 'wqset' is not locked
5035 * may (rarely) spin/block (see waitq_clear_prepost_locked)
5036 */
5037 kern_return_t waitq_set_unlink_all(struct waitq_set *wqset)
5038 {
5039 struct setid_link *link;
5040 uint64_t prepost_id, set_links_id = 0;
5041 spl_t spl;
5042
5043 assert(waitqs_is_set(wqset));
5044
5045 wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);
5046
5047 /*
5048 * This operation does not require interaction with any of the set's
5049 * constituent wait queues. All we have to do is invalidate the SetID
5050 */
5051 if (waitq_irq_safe(&wqset->wqset_q))
5052 spl = splsched();
5053 waitq_set_lock(wqset);
5054
5055 /* invalidate and re-alloc the link object first */
5056 link = lt_get_link(wqset->wqset_id);
5057
5058 /* we may have raced with a waitq_set_deinit: handle this */
5059 if (!link) {
5060 waitq_set_unlock(wqset);
5061 return KERN_SUCCESS;
5062 }
5063
5064 lt_invalidate(link);
5065
5066 /* re-alloc the object to get a new generation ID */
5067 lt_realloc_link(link, SLT_WQS);
5068 link->sl_wqs.sl_set = wqset;
5069
5070 wqset->wqset_id = link->sl_set_id.id;
5071 sl_set_valid(link);
5072 lt_put_link(link);
5073
5074 /* clear any preposts attached to this set */
5075 prepost_id = wqset->wqset_prepost_id;
5076 wqset->wqset_prepost_id = 0;
5077
5078 /*
5079 * clear set linkage and prepost object associated with this set:
5080 * waitq sets may prepost to other sets if, for example, they are
5081 * associated with a kqueue which is in a select set.
5082 *
5083 * This may drop and re-acquire the set lock, but that's OK because
5084 * the resulting state will remain consistent.
5085 */
5086 waitq_unlink_all_locked(&wqset->wqset_q, &set_links_id, &spl, NULL);
5087
5088 waitq_set_unlock(wqset);
5089 if (waitq_irq_safe(&wqset->wqset_q))
5090 splx(spl);
5091
5092 /*
5093 * release all the set link objects
5094 * (links to other sets to which this set was previously added)
5095 */
5096 if (set_links_id)
5097 (void)walk_setid_links(LINK_WALK_ONE_LEVEL, &wqset->wqset_q,
5098 set_links_id, SLT_LINK, NULL,
5099 waitq_unlink_all_cb);
5100
5101 /* drop / unlink all the prepost table objects */
5102 if (prepost_id)
5103 (void)wq_prepost_iterate(prepost_id, NULL,
5104 wqset_clear_prepost_chain_cb);
5105
5106 return KERN_SUCCESS;
5107 }
5108
5109
5110 static int waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,
5111 struct setid_link *link)
5112 {
5113 uint32_t *num = (uint32_t *)ctx;
5114 (void)waitq;
5115
5116 /*
5117 * In the worst case, we'll have to allocate 2 prepost objects
5118 * per waitq set (if the set was already preposted by another
5119 * waitq).
5120 */
5121 if (sl_type(link) == SLT_WQS) {
5122 /*
5123 * check to see if the associated waitq actually supports
5124 * preposting
5125 */
5126 if (waitq_set_can_prepost(link->sl_wqs.sl_set))
5127 *num += 2;
5128 }
5129 return WQ_ITERATE_CONTINUE;
5130 }
5131
5132 static int waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
5133 spl_t *s, int *did_unlock,
5134 struct wq_prepost **wqp)
5135 {
5136 struct wq_prepost *tmp;
5137 struct wqp_cache *cache;
5138
5139 *did_unlock = 0;
5140
5141 /*
5142 * Before we unlock the waitq, check the per-processor prepost object
5143 * cache to see if there's enough there for us. If so, do the
5144 * allocation, keep the lock and save an entire iteration over the set
5145 * linkage!
5146 */
5147 if (waitq) {
5148 disable_preemption();
5149 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
5150 if (nalloc <= (int)cache->avail)
5151 goto do_alloc;
5152 enable_preemption();
5153
5154 /* unlock the waitq to perform the allocation */
5155 *did_unlock = 1;
5156 waitq_unlock(waitq);
5157 if (waitq_irq_safe(waitq))
5158 splx(*s);
5159 }
5160
5161 do_alloc:
5162 tmp = wq_prepost_alloc(WQT_RESERVED, nalloc);
5163 if (!tmp)
5164 panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",
5165 nalloc, waitq, *wqp);
5166 if (*wqp) {
5167 /* link the two lists */
5168 int __assert_only rc;
5169 rc = wq_prepost_rlink(tmp, *wqp);
5170 assert(rc == nalloc);
5171 }
5172 *wqp = tmp;
5173
5174 /*
5175 * If the caller can block, then enforce a minimum-free table element
5176 * policy here. This helps ensure that we will have enough prepost
5177 * objects for callers such as selwakeup() that can be called with
5178 * spin locks held.
5179 */
5180 if (get_preemption_level() == 0)
5181 wq_prepost_ensure_free_space();
5182
5183 if (waitq) {
5184 if (*did_unlock == 0) {
5185 /* decrement the preemption count if alloc from cache */
5186 enable_preemption();
5187 } else {
5188 /* otherwise: re-lock the waitq */
5189 if (waitq_irq_safe(waitq))
5190 *s = splsched();
5191 waitq_lock(waitq);
5192 }
5193 }
5194
5195 return nalloc;
5196 }
5197
5198 static int waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)
5199 {
5200 int npreposts = 0;
5201
5202 /*
5203 * If the waitq is not currently part of a set, and we're not asked to
5204 * keep the waitq locked, then we'll want to have 3 in reserve
5205 * just in case it becomes part of a set while we unlock and reserve.
5206 * We may need up to 1 object for the waitq, and 2 for the set.
5207 */
5208 if (waitq->waitq_set_id == 0) {
5209 npreposts = 3;
5210 } else {
5211 /* this queue has never been preposted before */
5212 if (waitq->waitq_prepost_id == 0)
5213 npreposts = 3;
5214
5215 /*
5216 * Walk the set of table linkages associated with this waitq
5217 * and count the worst-case number of prepost objects that
5218 * may be needed during a wakeup_all. We can walk this without
5219 * locking each set along the way because the table-based IDs
5220 * disconnect us from the set pointers themselves, and the
5221 * table walking is careful to read the setid values only once.
5222 * Locking each set up the chain also doesn't guarantee that
5223 * their membership won't change between the time we unlock
5224 * that set and when we actually go to prepost, so our
5225 * situation is no worse than before and we've alleviated lock
5226 * contention on any sets to which this waitq belongs.
5227 */
5228 (void)walk_setid_links(LINK_WALK_FULL_DAG_UNLOCKED,
5229 waitq, waitq->waitq_set_id,
5230 SLT_WQS, (void *)&npreposts,
5231 waitq_prepost_reserve_cb);
5232 }
5233
5234 if (extra > 0)
5235 npreposts += extra;
5236
5237 if (npreposts == 0 && !keep_locked) {
5238 /*
5239 * If we get here, we were asked to reserve some prepost
5240 * objects for a waitq that was previously preposted and is not
5241 * currently a member of any sets. We have also been
5242 * instructed to unlock the waitq when we're done. In this
5243 * case, we pre-allocated enough reserved objects to handle
5244 * the case where the waitq gets added to a single set when
5245 * the lock is released.
5246 */
5247 npreposts = 3;
5248 }
5249
5250 return npreposts;
5251 }
5252
5253
5254 /**
5255 * pre-allocate prepost objects for 'waitq'
5256 *
5257 * Conditions:
5258 * 'waitq' is not locked
5259 *
5260 * Returns:
5261 * panic on error
5262 *
5263 * 0 on success, '*reserved' is set to the head of a singly-linked
5264 * list of pre-allocated prepost objects.
5265 *
5266 * Notes:
5267 * If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation
5268 * atomically and returns 'waitq' locked. If the waitq requires
5269 * interrupts to be disabled, then the output parameter 's' is set to the
5270 * previous interrupt state (from splsched), and the caller is
5271 * responsible to call splx().
5272 *
5273 * This function attempts to pre-allocate precisely enough prepost
5274 * objects based on the current set membership of 'waitq'. If the
5275 * operation is performed atomically, then the caller
5276 * is guaranteed to have enough pre-allocated prepost objects to avoid
5277 * any (rare) blocking in the wakeup path.
5278 */
5279 uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra,
5280 waitq_lock_state_t lock_state, spl_t *s)
5281 {
5282 uint64_t reserved = 0;
5283 uint64_t prev_setid = 0, prev_prepostid = 0;
5284 struct wq_prepost *wqp = NULL;
5285 int nalloc = 0, npreposts = 0;
5286 int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);
5287 int unlocked = 0;
5288
5289 if (s)
5290 *s = 0;
5291
5292 wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",
5293 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);
5294
5295 if (waitq == NULL && extra > 0) {
5296 /*
5297 * Simple prepost object allocation:
5298 * we'll add 2 more because the waitq might need an object,
5299 * and the set itself may need a new POST object in addition
5300 * to the number of preposts requested by the caller
5301 */
5302 nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL, NULL,
5303 &unlocked, &wqp);
5304 assert(nalloc == extra + 2);
5305 return wqp->wqp_prepostid.id;
5306 }
5307
5308 assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);
5309
5310 if (waitq_irq_safe(waitq))
5311 *s = splsched();
5312 waitq_lock(waitq);
5313
5314 /* global queues are never part of any sets */
5315 if (waitq_is_global(waitq)) {
5316 if (keep_locked)
5317 goto out;
5318 goto out_unlock;
5319 }
5320
5321 /* remember the set ID that we started with */
5322 prev_setid = waitq->waitq_set_id;
5323 prev_prepostid = waitq->waitq_prepost_id;
5324
5325 /*
5326 * If the waitq is not part of a set, and we're asked to
5327 * keep the waitq locked, then we don't have to reserve
5328 * anything!
5329 */
5330 if (prev_setid == 0 && keep_locked)
5331 goto out;
5332
5333 npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
5334
5335 /* nothing for us to do! */
5336 if (npreposts == 0) {
5337 if (keep_locked)
5338 goto out;
5339 goto out_unlock;
5340 }
5341
5342 try_alloc:
5343 /* this _may_ unlock and relock the waitq! */
5344 nalloc = waitq_alloc_prepost_reservation(npreposts, waitq, s,
5345 &unlocked, &wqp);
5346
5347 if (!unlocked) {
5348 /* allocation held the waitq lock: we're done! */
5349 if (keep_locked)
5350 goto out;
5351 goto out_unlock;
5352 }
5353
5354 /*
5355 * Before we return, if the allocation had to unlock the waitq, we
5356 * must check one more time to see if we have enough. If not, we'll
5357 * try to allocate the difference. If the caller requests it, we'll
5358 * also leave the waitq locked so that the use of the pre-allocated
5359 * prepost objects can be guaranteed to be enough if a wakeup_all is
5360 * performed before unlocking the waitq.
5361 */
5362
5363 /*
5364 * If the waitq is no longer associated with a set, or if the waitq's
5365 * set/prepostid has not changed since we first walked its linkage,
5366 * we're done.
5367 */
5368 if ((waitq->waitq_set_id == 0) ||
5369 (waitq->waitq_set_id == prev_setid &&
5370 waitq->waitq_prepost_id == prev_prepostid)) {
5371 if (keep_locked)
5372 goto out;
5373 goto out_unlock;
5374 }
5375
5376 npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
5377
5378 if (npreposts > nalloc) {
5379 prev_setid = waitq->waitq_set_id;
5380 prev_prepostid = waitq->waitq_prepost_id;
5381 npreposts = npreposts - nalloc; /* only allocate the diff */
5382 goto try_alloc;
5383 }
5384
5385 if (keep_locked)
5386 goto out;
5387
5388 out_unlock:
5389 waitq_unlock(waitq);
5390 if (waitq_irq_safe(waitq))
5391 splx(*s);
5392 out:
5393 if (wqp)
5394 reserved = wqp->wqp_prepostid.id;
5395
5396 return reserved;
5397 }
5398
5399 /**
5400 * release a linked list of prepost objects allocated via _prepost_reserve
5401 *
5402 * Conditions:
5403 * may (rarely) spin waiting for prepost table growth memcpy
5404 */
5405 void waitq_prepost_release_reserve(uint64_t id)
5406 {
5407 struct wq_prepost *wqp;
5408
5409 wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);
5410
5411 wqp = wq_prepost_rfirst(id);
5412 if (!wqp)
5413 return;
5414
5415 wq_prepost_release_rlist(wqp);
5416 }
5417
5418
5419 /**
5420 * clear all preposts from 'wqset'
5421 *
5422 * Conditions:
5423 * 'wqset' is not locked
5424 */
5425 void waitq_set_clear_preposts(struct waitq_set *wqset)
5426 {
5427 uint64_t prepost_id;
5428 spl_t spl;
5429
5430 assert(waitqs_is_set(wqset));
5431
5432 wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",
5433 wqset->wqset_id);
5434
5435 if (waitq_irq_safe(&wqset->wqset_q))
5436 spl = splsched();
5437 waitq_set_lock(wqset);
5438 prepost_id = wqset->wqset_prepost_id;
5439 wqset->wqset_prepost_id = 0;
5440 waitq_set_unlock(wqset);
5441 if (waitq_irq_safe(&wqset->wqset_q))
5442 splx(spl);
5443
5444 /* drop / unlink all the prepost table objects */
5445 if (prepost_id)
5446 (void)wq_prepost_iterate(prepost_id, NULL,
5447 wqset_clear_prepost_chain_cb);
5448 }
5449
5450
5451 /* ----------------------------------------------------------------------
5452 *
5453 * Iteration: waitq -> sets / waitq_set -> preposts
5454 *
5455 * ---------------------------------------------------------------------- */
5456
5457 struct wq_it_ctx {
5458 void *input;
5459 void *ctx;
5460 waitq_iterator_t it;
5461
5462 spl_t *spl;
5463 };
5464
5465 static int waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,
5466 struct setid_link *link)
5467 {
5468 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5469 struct waitq_set *wqset;
5470 int ret;
5471 spl_t spl;
5472
5473 (void)waitq;
5474 assert(sl_type(link) == SLT_WQS);
5475
5476 /*
5477 * the waitq is locked, so we can just take the set lock
5478 * and call the iterator function
5479 */
5480 wqset = link->sl_wqs.sl_set;
5481 assert(wqset != NULL);
5482
5483 if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q))
5484 spl = splsched();
5485 waitq_set_lock(wqset);
5486
5487 ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);
5488
5489 waitq_set_unlock(wqset);
5490 if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q))
5491 splx(spl);
5492
5493 return ret;
5494 }
5495
5496 /**
5497 * call external iterator function for each prepost object in wqset
5498 *
5499 * Conditions:
5500 * Called from wq_prepost_foreach_locked
5501 * (wqset locked, waitq _not_ locked)
5502 */
5503 static int wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,
5504 struct wq_prepost *wqp, struct waitq *waitq)
5505 {
5506 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5507 uint64_t wqp_id;
5508 int ret;
5509 spl_t s;
5510
5511 (void)wqp;
5512
5513 /*
5514 * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.
5515 * Taking the 'waitq' lock is a lock order violation, so we need to be
5516 * careful. We also must realize that we may have taken a reference to
5517 * the 'wqp' just as the associated waitq was being torn down (or
5518 * clearing all its preposts) - see waitq_clear_prepost_locked(). If
5519 * the 'wqp' is valid and we can get the waitq lock, then we are good
5520 * to go. If not, we need to back off, check that the 'wqp' hasn't
5521 * been invalidated, and try to re-take the locks.
5522 */
5523 if (waitq_irq_safe(waitq))
5524 s = splsched();
5525 if (waitq_lock_try(waitq))
5526 goto call_iterator;
5527
5528 if (waitq_irq_safe(waitq))
5529 splx(s);
5530
5531 if (!wqp_is_valid(wqp))
5532 return WQ_ITERATE_RESTART;
5533
5534 /* We are passed a prepost object with a reference on it. If neither
5535 * the waitq set nor the waitq requires interrupts disabled, then we
5536 * may block on the delay(1) call below. We can't hold a prepost
5537 * object reference while blocking, so we have to give that up as well
5538 * and re-acquire it when we come back.
5539 */
5540 wqp_id = wqp->wqp_prepostid.id;
5541 wq_prepost_put(wqp);
5542 waitq_set_unlock(wqset);
5543 wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",
5544 wqset, wqp, wqp_id, waitq);
5545 delay(1);
5546 waitq_set_lock(wqset);
5547 wqp = wq_prepost_get(wqp_id);
5548 if (!wqp)
5549 /* someone cleared preposts while we slept! */
5550 return WQ_ITERATE_DROPPED;
5551
5552 /*
5553 * TODO:
5554 * This differs slightly from the logic in ipc_mqueue.c:
5555 * ipc_mqueue_receive_on_thread(). There, if the waitq lock
5556 * can't be obtained, the prepost link is placed on the back of
5557 * the chain, and the iteration starts from the beginning. Here,
5558 * we just restart from the beginning.
5559 */
5560 return WQ_ITERATE_RESTART;
5561
5562 call_iterator:
5563 if (!wqp_is_valid(wqp)) {
5564 ret = WQ_ITERATE_RESTART;
5565 goto out_unlock;
5566 }
5567
5568 /* call the external callback */
5569 ret = wctx->it(wctx->ctx, waitq, wqset);
5570
5571 if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {
5572 ret = WQ_ITERATE_BREAK;
5573 if (wctx->spl)
5574 *(wctx->spl) = s;
5575 goto out;
5576 }
5577
5578 out_unlock:
5579 waitq_unlock(waitq);
5580 if (waitq_irq_safe(waitq))
5581 splx(s);
5582
5583 out:
5584 return ret;
5585 }
5586
5587 /**
5588 * iterator over all sets to which the given waitq has been linked
5589 *
5590 * Conditions:
5591 * 'waitq' is locked
5592 */
5593 int waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)
5594 {
5595 int ret;
5596 struct wq_it_ctx wctx = {
5597 .input = (void *)waitq,
5598 .ctx = ctx,
5599 .it = it,
5600 };
5601 if (!it || !waitq)
5602 return KERN_INVALID_ARGUMENT;
5603
5604 ret = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
5605 SLT_WQS, (void *)&wctx, waitq_iterate_sets_cb);
5606 if (ret == WQ_ITERATE_CONTINUE)
5607 ret = WQ_ITERATE_SUCCESS;
5608 return ret;
5609 }
5610
5611 /**
5612 * iterator over all preposts in the given wqset
5613 *
5614 * Conditions:
5615 * 'wqset' is locked
5616 */
5617 int waitq_set_iterate_preposts(struct waitq_set *wqset,
5618 void *ctx, waitq_iterator_t it, spl_t *s)
5619 {
5620 struct wq_it_ctx wctx = {
5621 .input = (void *)wqset,
5622 .ctx = ctx,
5623 .it = it,
5624 .spl = s,
5625 };
5626 if (!it || !wqset)
5627 return WQ_ITERATE_INVALID;
5628
5629 assert(waitq_held(&wqset->wqset_q));
5630
5631 return wq_prepost_foreach_locked(wqset, (void *)&wctx,
5632 wqset_iterate_prepost_cb);
5633 }
5634
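/*
 * Illustrative sketch (editor's note, not part of the original source):
 * a minimal iterator callback for waitq_iterate_sets(). The callback
 * signature (ctx, waitq, wqset) is inferred from the wctx->it() call sites
 * above; the counter context is hypothetical.
 */
#if 0 /* example only: not compiled */
static int count_sets_cb(void *ctx, struct waitq *waitq,
			 struct waitq_set *wqset)
{
	(void)waitq;
	(void)wqset;
	*(int *)ctx += 1;	/* called with 'wqset' locked */
	return WQ_ITERATE_CONTINUE;
}

static int count_sets_example(struct waitq *waitq)
{
	int nsets = 0;
	spl_t s = 0;

	if (waitq_irq_safe(waitq))
		s = splsched();
	waitq_lock(waitq);

	/* 'waitq' must be held across the iteration (see Conditions above) */
	(void)waitq_iterate_sets(waitq, &nsets, count_sets_cb);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq))
		splx(s);

	return nsets;
}
#endif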
5635
5636 /* ----------------------------------------------------------------------
5637 *
5638 * Higher-level APIs
5639 *
5640 * ---------------------------------------------------------------------- */
5641
5642 /**
5643 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5644 *
5645 * Conditions:
5646 * 'waitq' is not locked
5647 * will disable and re-enable interrupts while locking current_thread()
5648 */
5649 wait_result_t waitq_assert_wait64(struct waitq *waitq,
5650 event64_t wait_event,
5651 wait_interrupt_t interruptible,
5652 uint64_t deadline)
5653 {
5654 wait_result_t ret;
5655 thread_t thread = current_thread();
5656 spl_t s;
5657
5658 if (!waitq_valid(waitq))
5659 panic("Invalid waitq: %p", waitq);
5660
5661 if (waitq_irq_safe(waitq))
5662 s = splsched();
5663 waitq_lock(waitq);
5664
5665 if (!waitq_irq_safe(waitq))
5666 s = splsched();
5667 thread_lock(thread);
5668
5669 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5670 TIMEOUT_URGENCY_SYS_NORMAL,
5671 deadline, TIMEOUT_NO_LEEWAY, thread);
5672
5673 thread_unlock(thread);
5674 waitq_unlock(waitq);
5675
5676 splx(s);
5677
5678 return ret;
5679 }
5680
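/*
 * Illustrative sketch (editor's note, not part of the original source):
 * the waiter side of the API: declare the wait, then actually block via
 * thread_block() from sched_prim.h. CAST_EVENT64_T and THREAD_CONTINUE_NULL
 * are the usual kernel helpers; a deadline of 0 is the conventional
 * "no timeout". The event pointer is hypothetical.
 */
#if 0 /* example only: not compiled */
static wait_result_t wait_for_event_example(struct waitq *waitq, void *event)
{
	wait_result_t wr;

	wr = waitq_assert_wait64(waitq, CAST_EVENT64_T(event),
				 THREAD_UNINT, 0 /* no deadline */);
	if (wr == THREAD_WAITING)
		wr = thread_block(THREAD_CONTINUE_NULL);

	/* THREAD_AWAKENED on wakeup; other wait_result_t values otherwise */
	return wr;
}
#endif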
5681 /**
5682 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5683 *
5684 * Conditions:
5685 * 'waitq' is not locked
5686 * will disable and re-enable interrupts while locking current_thread()
5687 */
5688 wait_result_t waitq_assert_wait64_leeway(struct waitq *waitq,
5689 event64_t wait_event,
5690 wait_interrupt_t interruptible,
5691 wait_timeout_urgency_t urgency,
5692 uint64_t deadline,
5693 uint64_t leeway)
5694 {
5695 wait_result_t ret;
5696 thread_t thread = current_thread();
5697 spl_t s;
5698
5699 if (!waitq_valid(waitq))
5700 panic("Invalid waitq: %p", waitq);
5701
5702 if (waitq_irq_safe(waitq))
5703 s = splsched();
5704 waitq_lock(waitq);
5705
5706 if (!waitq_irq_safe(waitq))
5707 s = splsched();
5708 thread_lock(thread);
5709
5710 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5711 urgency, deadline, leeway, thread);
5712
5713 thread_unlock(thread);
5714 waitq_unlock(waitq);
5715
5716 splx(s);
5717
5718 return ret;
5719 }
5720
5721 /**
5722 * wakeup a single thread from a waitq that's waiting for a given event
5723 *
5724 * Conditions:
5725 * 'waitq' is not locked
5726 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5727 * may disable and re-enable interrupts
5728 *
5729 * Notes:
5730 * will _not_ block if waitq is global (or not a member of any set)
5731 */
5732 kern_return_t waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
5733 wait_result_t result, int priority)
5734 {
5735 kern_return_t kr;
5736 uint64_t reserved_preposts = 0;
5737 spl_t spl;
5738
5739 if (!waitq_valid(waitq))
5740 panic("Invalid waitq: %p", waitq);
5741
5742 /* NOTE: this will _not_ reserve anything if waitq is global */
5743 reserved_preposts = waitq_prepost_reserve(waitq, 0,
5744 WAITQ_KEEP_LOCKED, &spl);
5745
5746 /* waitq is locked upon return */
5747 kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
5748 &reserved_preposts, priority, WAITQ_UNLOCK);
5749
5750 if (waitq_irq_safe(waitq))
5751 splx(spl);
5752
5753 /* release any left-over prepost object (won't block/lock anything) */
5754 waitq_prepost_release_reserve(reserved_preposts);
5755
5756 return kr;
5757 }
5758
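/*
 * Illustrative sketch (editor's note, not part of the original source):
 * the waker side matching the assert-wait example above. The event pointer
 * is hypothetical, and WAITQ_ALL_PRIORITIES is assumed to be the
 * "no priority filter" value from kern/waitq.h.
 */
#if 0 /* example only: not compiled */
static void wake_one_waiter_example(struct waitq *waitq, void *event)
{
	kern_return_t kr;

	kr = waitq_wakeup64_one(waitq, CAST_EVENT64_T(event),
				THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	if (kr == KERN_NOT_WAITING) {
		/* no thread was waiting on this event */
	}
}
#endif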
5759 /**
5760 * wakeup all threads from a waitq that are waiting for a given event
5761 *
5762 * Conditions:
5763 * 'waitq' is not locked
5764 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5765 * may disable and re-enable interrupts
5766 *
5767 * Notes:
5768 * will _not_ block if waitq is global (or not a member of any set)
5769 */
5770 kern_return_t waitq_wakeup64_all(struct waitq *waitq,
5771 event64_t wake_event,
5772 wait_result_t result,
5773 int priority)
5774 {
5775 kern_return_t ret;
5776 uint64_t reserved_preposts = 0;
5777 spl_t s;
5778
5779 if (!waitq_valid(waitq))
5780 panic("Invalid waitq: %p", waitq);
5781
5782 /* keep waitq locked upon return */
5783 /* NOTE: this will _not_ reserve anything if waitq is global */
5784 reserved_preposts = waitq_prepost_reserve(waitq, 0,
5785 WAITQ_KEEP_LOCKED, &s);
5786
5787 /* waitq is locked */
5788
5789 ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
5790 &reserved_preposts, priority,
5791 WAITQ_UNLOCK);
5792
5793 if (waitq_irq_safe(waitq))
5794 splx(s);
5795
5796 waitq_prepost_release_reserve(reserved_preposts);
5797
5798 return ret;
5799
5800 }
5801
5802 /**
5803 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
5804 *
5805 * Conditions:
5806 * 'waitq' is not locked
5807 *
5808 * Notes:
5809 * May temporarily disable and re-enable interrupts
5810 */
5811 kern_return_t waitq_wakeup64_thread(struct waitq *waitq,
5812 event64_t wake_event,
5813 thread_t thread,
5814 wait_result_t result)
5815 {
5816 kern_return_t ret;
5817 spl_t s, th_spl;
5818
5819 if (!waitq_valid(waitq))
5820 panic("Invalid waitq: %p", waitq);
5821
5822 if (waitq_irq_safe(waitq))
5823 s = splsched();
5824 waitq_lock(waitq);
5825
5826 ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
5827 /* on success, returns 'thread' locked */
5828
5829 waitq_unlock(waitq);
5830
5831 if (ret == KERN_SUCCESS) {
5832 ret = thread_go(thread, result);
5833 assert(ret == KERN_SUCCESS);
5834 thread_unlock(thread);
5835 splx(th_spl);
5836 waitq_stats_count_wakeup(waitq);
5837 } else {
5838 ret = KERN_NOT_WAITING;
5839 waitq_stats_count_fail(waitq);
5840 }
5841
5842 if (waitq_irq_safe(waitq))
5843 splx(s);
5844
5845 return ret;
5846 }