osfmk/kern/waitq.c

   1 /*
   2  * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_FREE_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56
  57 /*
  58  * un-comment the following lines to debug the link/prepost tables
  59  * NOTE: this expands each element by ~40 bytes
  60  */
  61 //#define KEEP_WAITQ_LINK_STATS
  62 //#define KEEP_WAITQ_PREPOST_STATS
  63
  64 #include <kern/ast.h>
  65 #include <kern/backtrace.h>
  66 #include <kern/kern_types.h>
  67 #include <kern/ltable.h>
  68 #include <kern/mach_param.h>
  69 #include <kern/queue.h>
  70 #include <kern/sched_prim.h>
  71 #include <kern/simple_lock.h>
  72 #include <kern/spl.h>
  73 #include <kern/waitq.h>
  74 #include <kern/zalloc.h>
  75 #include <kern/policy_internal.h>
  76
  77 #include <libkern/OSAtomic.h>
  78 #include <mach/sync_policy.h>
  79 #include <vm/vm_kern.h>
  80
  81 #include <sys/kdebug.h>
  82
/*
 * Build-time sanity check: if either of the per-element statistics options
 * (KEEP_WAITQ_LINK_STATS / KEEP_WAITQ_PREPOST_STATS) is defined, both
 * CONFIG_LTABLE_STATS and CONFIG_WAITQ_STATS must be enabled as well,
 * otherwise the build is stopped with an #error.
 */
#if defined(KEEP_WAITQ_LINK_STATS) || defined(KEEP_WAITQ_PREPOST_STATS)
#  if !CONFIG_LTABLE_STATS
#    error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS"
#  endif
#  if !CONFIG_WAITQ_STATS
#    error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS"
#  endif
#endif
  91
/*
 * wqdbg: debug logging. When CONFIG_WAITQ_DEBUG is set this prints a line
 * prefixed with "WQ[<calling function>]:"; otherwise it expands to an empty
 * statement and its arguments are not evaluated.
 */
#if CONFIG_WAITQ_DEBUG
#define wqdbg(fmt,...) \
        printf("WQ[%s]:  " fmt "\n", __func__, ## __VA_ARGS__)
#else
#define wqdbg(fmt,...) do { } while (0)
#endif

/*
 * wqdbg_v: verbose debug logging, enabled only when WAITQ_VERBOSE_DEBUG is
 * defined. Compiles to an empty statement otherwise.
 */
#ifdef WAITQ_VERBOSE_DEBUG
#define wqdbg_v(fmt,...) \
        printf("WQ[v:%s]:  " fmt "\n", __func__, ## __VA_ARGS__)
#else
#define wqdbg_v(fmt,...) do { } while (0)
#endif

/* wqinfo: unconditional informational message, tagged with the caller's name */
#define wqinfo(fmt,...) \
        printf("WQ[%s]: " fmt "\n", __func__,  ## __VA_ARGS__)

/* wqerr: unconditional error message, tagged with the caller's name */
#define wqerr(fmt,...) \
        printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
 111
 112 /*
 113  * file-static functions / data
 114  */
/* forward declarations of thread-selection helpers defined later in this file */
static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
                                        uint64_t *reserved_preposts,
                                        int priority, spl_t *spl);

static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
                                                event64_t event,
                                                thread_t thread, spl_t *spl);

/* NOTE(review): presumably the element limit for waitq_set_zone - confirm at the zone setup */
#define WAITQ_SET_MAX (task_max * 3)
static zone_t waitq_set_zone;


/*
 * P2ROUNDUP: round 'x' (first truncated to uint32_t) up to the next multiple
 * of 'align'. The negate-and-mask trick is only correct when 'align' is a
 * power of two.
 * ROUNDDOWN: round 'x' down to a multiple of 'y' by dividing and multiplying
 * back (relies on truncating integer division).
 */
#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
#define ROUNDDOWN(x,y)  (((x)/(y))*(y))


/*
 * Backtrace capture helper used by the statistics code; it is only declared
 * when one of the stats configurations is enabled.
 */
#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
#endif
 134
 135 #if __arm64__
 136
 137 #define waitq_lock_to(wq,to) \
 138         (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, (uint32_t)to))
 139
 140 #define waitq_lock_unlock(wq) \
 141         (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))
 142
 143 #define waitq_lock_init(wq) \
 144         (wq->waitq_interlock = 0)
 145
 146 #else
 147
 148 #define waitq_lock_to(wq,to) \
 149         (hw_lock_to(&(wq)->waitq_interlock, (uint32_t)to))
 150
 151 #define waitq_lock_unlock(wq) \
 152         (hw_lock_unlock(&(wq)->waitq_interlock))
 153
 154 #define waitq_lock_init(wq) \
 155         (hw_lock_init(&(wq)->waitq_interlock))
 156
 157 #endif  /* __arm64__ */
 158
/*
 * Prepost callback function for specially marked waitq sets
 * (prepost alternative)
 */
extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority);

/*
 * Free-space thresholds consulted by wql_ensure_free_space() and
 * wq_prepost_ensure_free_space():
 *   g_min_free_table_elem - minimum number of free elements to keep in a table
 *   g_min_free_cache      - extra head-room for the per-CPU prepost caches;
 *                           lazily set to WQP_CACHE_MAX * ml_get_max_cpus()
 *                           the first time it is found to be zero
 * NOTE(review): DEFAULT_MIN_FREE_TABLE_ELEM is presumably the initial value of
 * g_min_free_table_elem - confirm at the initialization site.
 */
#define DEFAULT_MIN_FREE_TABLE_ELEM    100
static uint32_t g_min_free_table_elem;
static uint32_t g_min_free_cache;
 168
 169
 170 /* ----------------------------------------------------------------------
 171  *
 172  * SetID Link Table Implementation
 173  *
 174  * ---------------------------------------------------------------------- */
/* the table holding every struct waitq_link element */
static struct link_table g_wqlinktable;

/*
 * Element types stored in g_wqlinktable. Apart from WQL_ALL these are
 * aliases of the generic link-table element types.
 */
enum wq_link_type {
        WQL_ALL     = -1,       /* wildcard for walk_waitq_links(): match every type */
        WQL_FREE    = LT_FREE,
        WQL_WQS     = LT_ELEM,  /* element refers to a waitq set (wql_wqs) */
        WQL_LINK    = LT_LINK,  /* element joins two other set IDs (wql_link) */
};
 183
/*
 * One element of the SetID link table. The generic table header 'wqte' must
 * stay the first member: the code in this file casts freely between
 * 'struct lt_elem *' and 'struct waitq_link *'. Which arm of the union is
 * meaningful depends on the element type (see wql_type()).
 */
struct waitq_link {
        struct lt_elem wqte;

        union {
                /* wqt_type == WQL_WQS (LT_ELEM) */
                struct {
                        struct waitq_set *wql_set;
                        /* uint64_t          sl_prepost_id; */
                } wql_wqs;

                /* wqt_type == WQL_LINK (LT_LINK) */
                struct {
                        uint64_t          left_setid;
                        uint64_t          right_setid;
                } wql_link;
        };
#ifdef KEEP_WAITQ_LINK_STATS
        /* debugging history: who allocated / validated / invalidated this element, and when */
        thread_t  sl_alloc_th;
        task_t    sl_alloc_task;
        uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];
        uint64_t  sl_alloc_ts;
        uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];
        uint64_t  sl_invalidate_ts;
        uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];
        uint64_t  sl_mkvalid_ts;
        uint64_t  sl_free_ts;
#endif
};
/* the power-of-two size requirement is only enforced when the stats fields are absent */
#if !defined(KEEP_WAITQ_LINK_STATS)
static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0,
               "waitq_link struct must be a power of two!");
#endif
 216
/* reference count encoded in the element's lt_bits */
#define wql_refcnt(link) \
        (lt_bits_refcnt((link)->wqte.lt_bits))

/* element type (enum wq_link_type) encoded in the element's lt_bits */
#define wql_type(link) \
        (lt_bits_type((link)->wqte.lt_bits))

/* mark the element valid and, when link stats are kept, record who did it */
#define wql_mkvalid(link) \
        do { \
                lt_elem_mkvalid(&(link)->wqte); \
                wql_do_mkvalid_stats(&(link)->wqte); \
        } while (0)

/* valid flag encoded in the element's lt_bits */
#define wql_is_valid(link) \
        lt_bits_valid((link)->wqte.lt_bits)

/* member alias: 'link->wql_setid' names the table ID of the element */
#define wql_setid wqte.lt_id

/* recognizable values written into an element's payload by wql_poison() */
#define WQL_WQS_POISON         ((void *)(0xf00df00d))
#define WQL_LINK_POISON        (0x0bad0badffffffffull)
 236
 237 static void wql_poison(struct link_table *table, struct lt_elem *elem)
 238 {
 239         struct waitq_link *link = (struct waitq_link *)elem;
 240         (void)table;
 241
 242         switch (wql_type(link)) {
 243         case WQL_WQS:
 244                 link->wql_wqs.wql_set = WQL_WQS_POISON;
 245                 break;
 246         case WQL_LINK:
 247                 link->wql_link.left_setid = WQL_LINK_POISON;
 248                 link->wql_link.right_setid = WQL_LINK_POISON;
 249                 break;
 250         default:
 251                 break;
 252         }
 253 #ifdef KEEP_WAITQ_LINK_STATS
 254         memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
 255         link->sl_alloc_ts = 0;
 256         memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
 257         link->sl_mkvalid_ts = 0;
 258
 259         link->sl_alloc_th = THREAD_NULL;
 260         /* leave the sl_alloc_task in place for debugging */
 261
 262         link->sl_free_ts = mach_absolute_time();
 263 #endif
 264 }
 265
 266 #ifdef KEEP_WAITQ_LINK_STATS
 267 static __inline__ void wql_do_alloc_stats(struct lt_elem *elem)
 268 {
 269         if (elem) {
 270                 struct waitq_link *link = (struct waitq_link *)elem;
 271                 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
 272                 waitq_grab_backtrace(link->sl_alloc_bt, 0);
 273                 link->sl_alloc_th = current_thread();
 274                 link->sl_alloc_task = current_task();
 275
 276                 assert(link->sl_alloc_ts == 0);
 277                 link->sl_alloc_ts = mach_absolute_time();
 278
 279                 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
 280                 link->sl_invalidate_ts = 0;
 281         }
 282 }
 283
 284 static __inline__ void wql_do_invalidate_stats(struct lt_elem *elem)
 285 {
 286         struct waitq_link *link = (struct waitq_link *)elem;
 287
 288         if (!elem)
 289                 return;
 290
 291         assert(link->sl_mkvalid_ts > 0);
 292
 293         memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
 294         link->sl_invalidate_ts = mach_absolute_time();
 295         waitq_grab_backtrace(link->sl_invalidate_bt, 0);
 296 }
 297
 298 static __inline__ void wql_do_mkvalid_stats(struct lt_elem *elem)
 299 {
 300         struct waitq_link *link = (struct waitq_link *)elem;
 301
 302         if (!elem)
 303                 return;
 304
 305         memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
 306         link->sl_mkvalid_ts = mach_absolute_time();
 307         waitq_grab_backtrace(link->sl_mkvalid_bt, 0);
 308 }
 309 #else
 310 #define wql_do_alloc_stats(e)
 311 #define wql_do_invalidate_stats(e)
 312 #define wql_do_mkvalid_stats(e)
 313 #endif /* KEEP_WAITQ_LINK_STATS */
 314
 315 static void wql_init(void)
 316 {
 317         uint32_t tablesz = 0, max_links = 0;
 318
 319         if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE)
 320                 tablesz = (uint32_t)g_lt_max_tbl_size;
 321
 322         tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
 323         max_links = tablesz / sizeof(struct waitq_link);
 324         assert(max_links > 0 && tablesz > 0);
 325
 326         /* we have a restricted index range */
 327         if (max_links > (LT_IDX_MAX + 1))
 328                 max_links = LT_IDX_MAX + 1;
 329
 330         wqinfo("init linktable with max:%d elements (%d bytes)",
 331                max_links, tablesz);
 332         ltable_init(&g_wqlinktable, "wqslab.wql", max_links,
 333                     sizeof(struct waitq_link), wql_poison);
 334 }
 335
 336 static void wql_ensure_free_space(void)
 337 {
 338         if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) {
 339                 /*
 340                  * we don't hold locks on these values, so check for underflow
 341                  */
 342                 if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) {
 343                         wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",
 344                                 g_wqlinktable.nelem, g_wqlinktable.used_elem,
 345                                 g_min_free_table_elem);
 346                         ltable_grow(&g_wqlinktable, g_min_free_table_elem);
 347                 }
 348         }
 349 }
 350
 351 static struct waitq_link *wql_alloc_link(int type)
 352 {
 353         struct lt_elem *elem;
 354
 355         elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0);
 356         wql_do_alloc_stats(elem);
 357         return (struct waitq_link *)elem;
 358 }
 359
 360 static void wql_realloc_link(struct waitq_link *link, int type)
 361 {
 362         ltable_realloc_elem(&g_wqlinktable, &link->wqte, type);
 363 #ifdef KEEP_WAITQ_LINK_STATS
 364         memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
 365         link->sl_alloc_ts = 0;
 366         wql_do_alloc_stats(&link->wqte);
 367
 368         memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
 369         link->sl_invalidate_ts = 0;
 370 #endif
 371 }
 372
 373 static void wql_invalidate(struct waitq_link *link)
 374 {
 375         lt_elem_invalidate(&link->wqte);
 376         wql_do_invalidate_stats(&link->wqte);
 377 }
 378
 379 static struct waitq_link *wql_get_link(uint64_t setid)
 380 {
 381         struct lt_elem *elem;
 382
 383         elem = ltable_get_elem(&g_wqlinktable, setid);
 384         return (struct waitq_link *)elem;
 385 }
 386
 387 static void wql_put_link(struct waitq_link *link)
 388 {
 389         if (!link)
 390                 return;
 391         ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link);
 392 }
 393
 394 static struct waitq_link *wql_get_reserved(uint64_t setid, int type)
 395 {
 396         struct lt_elem *elem;
 397
 398         elem = lt_elem_list_first(&g_wqlinktable, setid);
 399         if (!elem)
 400                 return NULL;
 401         ltable_realloc_elem(&g_wqlinktable, elem, type);
 402         return (struct waitq_link *)elem;
 403 }
 404
 405
/* forward declaration; the definition is not in this part of the file */
static inline int waitq_maybe_remove_link(struct waitq *waitq,
                                          uint64_t setid,
                                          struct waitq_link *parent,
                                          struct waitq_link *left,
                                          struct waitq_link *right);

/*
 * Walk types accepted by walk_waitq_links(). Both FULL_DAG variants also
 * descend into the sets that an encountered waitq set itself belongs to;
 * only LINK_WALK_FULL_DAG takes the lock of each such set while doing so.
 */
enum {
        LINK_WALK_ONE_LEVEL = 0,
        LINK_WALK_FULL_DAG  = 1,
        LINK_WALK_FULL_DAG_UNLOCKED = 2,
};

/*
 * Callback invoked by walk_waitq_links() for each matching link element.
 * Returning anything other than WQ_ITERATE_CONTINUE stops the walk and is
 * handed back to the caller of walk_waitq_links().
 */
typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx,
                                 struct waitq_link *link);
 420
 421 /**
 422  * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid'
 423  *
 424  * Conditions:
 425  *      waitq is locked (or NULL)
 426  *      'setid' is managed by 'waitq'
 427  *              this could be direct (waitq->waitq_set_id == setid)
 428  *              OR indirect (setid is the left/right ID in a LINK chain,
 429  *                           whose root is waitq->waitq_set_id)
 430  *
 431  * Notes:
 432  *      This function uses recursion to walk the set of table elements
 433  *      pointed to by 'setid'. For each element encountered, 'cb' will be
 434  *      called. If non-zero, the return value of this callback function can
 435  *      early-out of the table walk.
 436  *
 437  *      For each link element encountered, the function takes a reference to
 438  *      it. The reference is dropped only after the callback and any recursion
 439  *      has completed.
 440  *
 441  *      The assumed table/link/tree structure:
 442  *                   'setid'
 443  *                   /    \
 444  *                  /      \
 445  *              L(LINK)     R(LINK)
 446  *               /\             /\
 447  *              /  \           /  \
 448  *             /    \       Rl(*)  Rr(*)
 449  *         Ll(*)  Lr(*)      /\    /\
 450  *           /\     /\    ... ... ... ...
 451  *        ...  ... ... ...
 452  *                    \
 453  *                    WQS(wqset_q.waitq_setid == Sx)
  454  *                    [waitq set is a member of setid, 'Sx']
 455  *
 456  *                    'Sx'
 457  *                   /    \
 458  *                  /      \
 459  *              L(LINK)     R(LINK)
 460  *               /\             /\
 461  *             ... ...        ... ...
 462  *
 463  *      The basic algorithm is as follows:
 464  *      *) take a reference to the table object pointed to by 'setid'
 465  *      *) if appropriate, call 'cb' (potentially early-out on non-zero return)
 466  *      *) if the link object points to a waitq set, and the walk type
 467  *         is 'FULL_DAG' (full directed-acyclic-graph), then try to lock
 468  *         the associated waitq set object and recursively walk all sets to
 469  *         which that set belongs. This is a DFS of the tree structure.
  470  *      *) recurse down the left side of the tree (following the
  471  *         'left_setid' pointer in the link object)
  472  *      *) recurse down the right side of the tree (following the
  473  *         'right_setid' pointer in the link object)
 474  */
 475 static __attribute__((noinline))
 476 int walk_waitq_links(int walk_type, struct waitq *waitq,
 477                      uint64_t setid, int link_type,
 478                      void *ctx, wql_callback_func cb)
 479 {
 480         struct waitq_link *link;
 481         uint64_t nextid;
 482         int wqltype;
 483
 484         link = wql_get_link(setid);
 485
 486         /* invalid link */
 487         if (!link)
 488                 return WQ_ITERATE_CONTINUE;
 489
 490         setid = nextid = 0;
 491         wqltype = wql_type(link);
 492         if (wqltype == WQL_LINK) {
 493                 setid  = link->wql_link.left_setid;
 494                 nextid = link->wql_link.right_setid;
 495         }
 496
 497         /*
 498          * Make the callback only on specified link_type (or all links)
 499          * Note that after the callback, the link object may be
 500          * invalid. The only valid thing we can do is put our
 501          * reference to it (which may put it back on the free list)
 502          */
 503         if (link_type == WQL_ALL || link_type == wqltype) {
 504                 /* allow the callback to early-out */
 505                 int ret = cb(waitq, ctx, link);
 506                 if (ret != WQ_ITERATE_CONTINUE) {
 507                         wql_put_link(link);
 508                         return ret;
 509                 }
 510         }
 511
 512         if (wqltype == WQL_WQS &&
 513             (walk_type == LINK_WALK_FULL_DAG ||
 514              walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {
 515                 /*
 516                  * Recurse down any sets to which this wait queue set was
 517                  * added.  We do this just before we put our reference to
 518                  * the link object (which may free it).
 519                  */
 520                 struct waitq_set *wqset = link->wql_wqs.wql_set;
 521                 int ret = WQ_ITERATE_CONTINUE;
 522                 int should_unlock = 0;
 523                 uint64_t wqset_setid = 0;
 524
 525                 if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {
 526                         assert(!waitq_irq_safe(&wqset->wqset_q));
 527                         waitq_set_lock(wqset);
 528                         should_unlock = 1;
 529                 }
 530
 531                 /*
 532                  * verify the linked waitq set as it could have been
 533                  * invalidated before we grabbed the lock!
 534                  */
 535                 if (wqset->wqset_id != link->wql_setid.id) {
 536                         /*This is the bottom of the tree: just get out */
 537                         if (should_unlock) {
 538                                 waitq_set_unlock(wqset);
 539                         }
 540                         wql_put_link(link);
 541                         return WQ_ITERATE_CONTINUE;
 542                 }
 543
 544                 wqset_setid = wqset->wqset_q.waitq_set_id;
 545
 546                 if (wqset_setid > 0)
 547                         ret = walk_waitq_links(walk_type, &wqset->wqset_q,
 548                                                wqset_setid, link_type, ctx, cb);
 549                 if (should_unlock) {
 550                         waitq_set_unlock(wqset);
 551                 }
 552                 if (ret != WQ_ITERATE_CONTINUE) {
 553                         wql_put_link(link);
 554                         return ret;
 555                 }
 556         }
 557
 558         wql_put_link(link);
 559
 560         /* recurse down left side of the tree */
 561         if (setid) {
 562                 int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb);
 563                 if (ret != WQ_ITERATE_CONTINUE)
 564                         return ret;
 565         }
 566
 567         /* recurse down right side of the tree */
 568         if (nextid)
 569                 return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb);
 570
 571         return WQ_ITERATE_CONTINUE;
 572 }
 573
 574 /* ----------------------------------------------------------------------
 575  *
 576  * Prepost Link Table Implementation
 577  *
 578  * ---------------------------------------------------------------------- */
/* the table holding every struct wq_prepost element */
static struct link_table g_prepost_table;

/*
 * Element types stored in g_prepost_table; aliases of the generic
 * link-table element types.
 */
enum wq_prepost_type {
        WQP_FREE  = LT_FREE,
        WQP_WQ    = LT_ELEM,    /* element holds a waitq pointer (wqp_wq) */
        WQP_POST  = LT_LINK,    /* element holds next/waitq IDs (wqp_post) */
};
 586
/*
 * One element of the prepost table. The generic table header 'wqte' must
 * stay the first member: the code in this file casts freely between
 * 'struct lt_elem *' and 'struct wq_prepost *'. Which arm of the union is
 * meaningful depends on the element type (see wqp_type()).
 */
struct wq_prepost {
        struct lt_elem wqte;

        union {
                /* wqt_type == WQP_WQ (LT_ELEM) */
                struct {
                        struct waitq *wqp_wq_ptr;
                } wqp_wq;
                /* wqt_type == WQP_POST (LT_LINK) */
                struct {
                        uint64_t      wqp_next_id;
                        uint64_t      wqp_wq_id;
                } wqp_post;
        };
#ifdef KEEP_WAITQ_PREPOST_STATS
        /* debugging history: who allocated this element, and from where */
        thread_t  wqp_alloc_th;
        task_t    wqp_alloc_task;
        uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];
#endif
};
/* the power-of-two size requirement is only enforced when the stats fields are absent */
#if !defined(KEEP_WAITQ_PREPOST_STATS)
static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,
               "wq_prepost struct must be a power of two!");
#endif
 611
/* reference count encoded in the element's lt_bits */
#define wqp_refcnt(wqp) \
        (lt_bits_refcnt((wqp)->wqte.lt_bits))

/* element type (enum wq_prepost_type) encoded in the element's lt_bits */
#define wqp_type(wqp) \
        (lt_bits_type((wqp)->wqte.lt_bits))

/* mark the element valid */
#define wqp_set_valid(wqp) \
        lt_elem_mkvalid(&(wqp)->wqte)

/* valid flag encoded in the element's lt_bits */
#define wqp_is_valid(wqp) \
        lt_bits_valid((wqp)->wqte.lt_bits)

/* member alias: 'wqp->wqp_prepostid' names the table ID of the element */
#define wqp_prepostid wqte.lt_id

/*
 * Recognizable poison values. wqp_poison() writes WQP_POST_POISON into
 * WQP_POST elements; it does not write WQP_WQ_POISON.
 */
#define WQP_WQ_POISON              (0x0bad0badffffffffull)
#define WQP_POST_POISON            (0xf00df00df00df00d)
 628
 629 static void wqp_poison(struct link_table *table, struct lt_elem *elem)
 630 {
 631         struct wq_prepost *wqp = (struct wq_prepost *)elem;
 632         (void)table;
 633
 634         switch (wqp_type(wqp)) {
 635         case WQP_WQ:
 636                 break;
 637         case WQP_POST:
 638                 wqp->wqp_post.wqp_next_id = WQP_POST_POISON;
 639                 wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;
 640                 break;
 641         default:
 642                 break;
 643         }
 644 }
 645
 646 #ifdef KEEP_WAITQ_PREPOST_STATS
 647 static __inline__ void wqp_do_alloc_stats(struct lt_elem *elem)
 648 {
 649         if (!elem)
 650                 return;
 651
 652         struct wq_prepost *wqp = (struct wq_prepost *)elem;
 653         uintptr_t alloc_bt[sizeof(wqp->wqp_alloc_bt)];
 654
 655         waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES);
 656
 657         /* be sure the take stats for _all_ allocated objects */
 658         for (;;) {
 659                 memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt));
 660                 wqp->wqp_alloc_th = current_thread();
 661                 wqp->wqp_alloc_task = current_task();
 662                 wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte);
 663                 if (!wqp)
 664                         break;
 665         }
 666 }
 667 #else
 668 #define wqp_do_alloc_stats(e)
 669 #endif /* KEEP_WAITQ_LINK_STATS */
 670
 671 static void wqp_init(void)
 672 {
 673         uint32_t tablesz = 0, max_wqp = 0;
 674
 675         if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE)
 676                 tablesz = (uint32_t)g_lt_max_tbl_size;
 677
 678         tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
 679         max_wqp = tablesz / sizeof(struct wq_prepost);
 680         assert(max_wqp > 0 && tablesz > 0);
 681
 682         /* we have a restricted index range */
 683         if (max_wqp > (LT_IDX_MAX + 1))
 684                 max_wqp = LT_IDX_MAX + 1;
 685
 686         wqinfo("init prepost table with max:%d elements (%d bytes)",
 687                max_wqp, tablesz);
 688         ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp,
 689                     sizeof(struct wq_prepost), wqp_poison);
 690 }
 691
 692 /*
 693  * Refill the per-CPU cache.
 694  */
 695 static void wq_prepost_refill_cpu_cache(uint32_t nalloc)
 696 {
 697         struct lt_elem *new_head, *old_head;
 698         struct wqp_cache *cache;
 699
 700         /* require preemption enabled to allocate elements */
 701         if (get_preemption_level() != 0)
 702                 return;
 703
 704         new_head = ltable_alloc_elem(&g_prepost_table,
 705                                      LT_RESERVED, nalloc, 1);
 706         if (new_head == NULL)
 707                 return;
 708
 709         disable_preemption();
 710         cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
 711
 712         /* check once more before putting these elements on the list */
 713         if (cache->avail >= WQP_CACHE_MAX) {
 714                 lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED);
 715                 enable_preemption();
 716                 return;
 717         }
 718
 719         cache->avail += nalloc;
 720         if (cache->head == 0 || cache->head == LT_IDX_MAX) {
 721                 cache->head = new_head->lt_id.id;
 722                 goto out;
 723         }
 724
 725         old_head = lt_elem_list_first(&g_prepost_table, cache->head);
 726         (void)lt_elem_list_link(&g_prepost_table, new_head, old_head);
 727         cache->head = new_head->lt_id.id;
 728
 729 out:
 730         enable_preemption();
 731         return;
 732 }
 733
 734 static void wq_prepost_ensure_free_space(void)
 735 {
 736         uint32_t free_elem;
 737         uint32_t min_free;
 738         struct wqp_cache *cache;
 739
 740         if (g_min_free_cache == 0)
 741                 g_min_free_cache = (WQP_CACHE_MAX * ml_get_max_cpus());
 742
 743         /*
 744          * Ensure that we always have a pool of per-CPU prepost elements
 745          */
 746         disable_preemption();
 747         cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
 748         free_elem = cache->avail;
 749         enable_preemption();
 750
 751         if (free_elem < (WQP_CACHE_MAX / 3))
 752                 wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);
 753
 754         /*
 755          * Now ensure that we have a sufficient amount of free table space
 756          */
 757         free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;
 758         min_free = g_min_free_table_elem + g_min_free_cache;
 759         if (free_elem < min_free) {
 760                 /*
 761                  * we don't hold locks on these values, so check for underflow
 762                  */
 763                 if (g_prepost_table.used_elem <= g_prepost_table.nelem) {
 764                         wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",
 765                                 g_prepost_table.nelem, g_prepost_table.used_elem,
 766                                 g_min_free_table_elem, g_min_free_cache);
 767                         ltable_grow(&g_prepost_table, min_free);
 768                 }
 769         }
 770 }
 771
 772 static struct wq_prepost *wq_prepost_alloc(int type, int nelem)
 773 {
 774         struct lt_elem *elem;
 775         struct wq_prepost *wqp;
 776         struct wqp_cache *cache;
 777
 778         if (type != LT_RESERVED)
 779                 goto do_alloc;
 780         if (nelem == 0)
 781                 return NULL;
 782
 783         /*
 784          * First try to grab the elements from the per-CPU cache if we are
 785          * allocating RESERVED elements
 786          */
 787         disable_preemption();
 788         cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
 789         if (nelem <= (int)cache->avail) {
 790                 struct lt_elem *first, *next = NULL;
 791                 int nalloc = nelem;
 792
 793                 cache->avail -= nelem;
 794
 795                 /* grab the first element */
 796                 first = lt_elem_list_first(&g_prepost_table, cache->head);
 797
 798                 /* find the last element and re-adjust the cache head */
 799                 for (elem = first; elem != NULL && nalloc > 0; elem = next) {
 800                         next = lt_elem_list_next(&g_prepost_table, elem);
 801                         if (--nalloc == 0) {
 802                                 /* terminate the allocated list */
 803                                 elem->lt_next_idx = LT_IDX_MAX;
 804                                 break;
 805                         }
 806                 }
 807                 assert(nalloc == 0);
 808                 if (!next)
 809                         cache->head = LT_IDX_MAX;
 810                 else
 811                         cache->head = next->lt_id.id;
 812                 /* assert that we don't have mis-matched book keeping */
 813                 assert(!(cache->head == LT_IDX_MAX && cache->avail > 0));
 814                 enable_preemption();
 815                 elem = first;
 816                 goto out;
 817         }
 818         enable_preemption();
 819
 820 do_alloc:
 821         /* fall-back to standard table allocation */
 822         elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0);
 823         if (!elem)
 824                 return NULL;
 825
 826 out:
 827         wqp = (struct wq_prepost *)elem;
 828         wqp_do_alloc_stats(elem);
 829         return wqp;
 830 }
 831
 832 static void wq_prepost_invalidate(struct wq_prepost *wqp)
 833 {
 834         lt_elem_invalidate(&wqp->wqte);
 835 }
 836
 837 static struct wq_prepost *wq_prepost_get(uint64_t wqp_id)
 838 {
 839         struct lt_elem *elem;
 840
 841         elem = ltable_get_elem(&g_prepost_table, wqp_id);
 842         return (struct wq_prepost *)elem;
 843 }
 844
 845 static void wq_prepost_put(struct wq_prepost *wqp)
 846 {
 847         ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp);
 848 }
 849
 850 static int wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)
 851 {
 852         return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);
 853 }
 854
 855 static struct wq_prepost *wq_prepost_get_rnext(struct wq_prepost *head)
 856 {
 857         struct lt_elem *elem;
 858         struct wq_prepost *wqp;
 859         uint64_t id;
 860
 861         elem = lt_elem_list_next(&g_prepost_table, &head->wqte);
 862         if (!elem)
 863                 return NULL;
 864         id = elem->lt_id.id;
 865         elem = ltable_get_elem(&g_prepost_table, id);
 866
 867         if (!elem)
 868                 return NULL;
 869         wqp = (struct wq_prepost *)elem;
 870         if (elem->lt_id.id != id ||
 871             wqp_type(wqp) != WQP_POST ||
 872             wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {
 873                 ltable_put_elem(&g_prepost_table, elem);
 874                 return NULL;
 875         }
 876
 877         return wqp;
 878 }
 879
 880 static void wq_prepost_reset_rnext(struct wq_prepost *wqp)
 881 {
 882         (void)lt_elem_list_break(&g_prepost_table, &wqp->wqte);
 883 }
 884
 885
 886 /**
 887  * remove 'wqp' from the prepost list on 'wqset'
 888  *
 889  * Conditions:
 890  *      wqset is locked
 891  *      caller holds a reference on wqp (and is responsible to release it)
 892  *
 893  * Result:
 894  *      wqp is invalidated, wqset is potentially updated with a new
 895  *      prepost ID, and the next element of the prepost list may be
 896  *      consumed as well (if the list contained only 2 objects)
 897  */
 898 static int wq_prepost_remove(struct waitq_set *wqset,
 899                              struct wq_prepost *wqp)
 900 {
 901         int more_posts = 1;
 902         uint64_t next_id = wqp->wqp_post.wqp_next_id;
 903         uint64_t wqp_id = wqp->wqp_prepostid.id;
 904         struct wq_prepost *prev_wqp, *next_wqp;
 905
 906         assert(wqp_type(wqp) == WQP_POST);
 907         assert(wqset->wqset_q.waitq_prepost == 1);
 908
 909         if (next_id == wqp_id) {
 910                 /* the list is singular and becoming empty */
 911                 wqset->wqset_prepost_id = 0;
 912                 more_posts = 0;
 913                 goto out;
 914         }
 915
 916         prev_wqp = wq_prepost_get_rnext(wqp);
 917         assert(prev_wqp != NULL);
 918         assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);
 919         assert(prev_wqp->wqp_prepostid.id != wqp_id);
 920         assert(wqp_type(prev_wqp) == WQP_POST);
 921
 922         if (prev_wqp->wqp_prepostid.id == next_id) {
 923                 /*
 924                  * There are two items in the list, and we're removing one. We
 925                  * only need to keep the WQP_WQ pointer from 'prev_wqp'
 926                  */
 927                 wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;
 928                 wq_prepost_invalidate(prev_wqp);
 929                 wq_prepost_put(prev_wqp);
 930                 more_posts = 0;
 931                 goto out;
 932         }
 933
 934         /* prev->next = next */
 935         prev_wqp->wqp_post.wqp_next_id = next_id;
 936
 937         /* next->prev = prev */
 938         next_wqp = wq_prepost_get(next_id);
 939         assert(next_wqp != NULL);
 940         assert(next_wqp != wqp);
 941         assert(next_wqp != prev_wqp);
 942         assert(wqp_type(next_wqp) == WQP_POST);
 943
 944         wq_prepost_reset_rnext(next_wqp);
 945         wq_prepost_rlink(next_wqp, prev_wqp);
 946
 947         /* If we remove the head of the list, update the wqset */
 948         if (wqp_id == wqset->wqset_prepost_id)
 949                 wqset->wqset_prepost_id = next_id;
 950
 951         wq_prepost_put(prev_wqp);
 952         wq_prepost_put(next_wqp);
 953
 954 out:
 955         wq_prepost_reset_rnext(wqp);
 956         wq_prepost_invalidate(wqp);
 957         return more_posts;
 958 }
 959
 960 static struct wq_prepost *wq_prepost_rfirst(uint64_t id)
 961 {
 962         struct lt_elem *elem;
 963         elem = lt_elem_list_first(&g_prepost_table, id);
 964         wqp_do_alloc_stats(elem);
 965         return (struct wq_prepost *)(void *)elem;
 966 }
 967
 968 static struct wq_prepost *wq_prepost_rpop(uint64_t *id, int type)
 969 {
 970         struct lt_elem *elem;
 971         elem = lt_elem_list_pop(&g_prepost_table, id, type);
 972         wqp_do_alloc_stats(elem);
 973         return (struct wq_prepost *)(void *)elem;
 974 }
 975
 976 static void wq_prepost_release_rlist(struct wq_prepost *wqp)
 977 {
 978         int nelem = 0;
 979         struct wqp_cache *cache;
 980         struct lt_elem *elem;
 981
 982         if (!wqp)
 983                 return;
 984
 985         elem = &wqp->wqte;
 986
 987         /*
 988          * These are reserved elements: release them back to the per-cpu pool
 989          * if our cache is running low.
 990          */
 991         disable_preemption();
 992         cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
 993         if (cache->avail < WQP_CACHE_MAX) {
 994                 struct lt_elem *tmp = NULL;
 995                 if (cache->head != LT_IDX_MAX)
 996                         tmp = lt_elem_list_first(&g_prepost_table, cache->head);
 997                 nelem = lt_elem_list_link(&g_prepost_table, elem, tmp);
 998                 cache->head = elem->lt_id.id;
 999                 cache->avail += nelem;
1000                 enable_preemption();
1001                 return;
1002         }
1003         enable_preemption();
1004
1005         /* release these elements back to the main table */
1006         nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED);
1007
1008 #if CONFIG_WAITQ_STATS
1009         g_prepost_table.nreserved_releases += 1;
1010         OSDecrementAtomic64(&g_prepost_table.nreservations);
1011 #endif
1012 }
1013
/*
 * Callback type used by wq_prepost_foreach_locked() and wq_prepost_iterate().
 *
 *	wqset	the set being iterated (wq_prepost_iterate() passes NULL)
 *	ctx	opaque caller context, passed through unchanged
 *	wqp	the prepost object currently being visited
 *	waitq	the waitq pointer taken from the associated WQP_WQ object
 *		(wq_prepost_iterate() may pass NULL)
 *
 * The return value is one of the WQ_ITERATE_* codes; the iterators switch on
 * it to decide whether to continue, restart, or stop.
 */
typedef int (*wqp_callback_func)(struct waitq_set *wqset,
				 void *ctx,
				 struct wq_prepost *wqp,
				 struct waitq *waitq);
1018
/**
 * iterate over a chain of preposts associated with a waitq set.
 *
 * Parameters:
 *	wqset	the set whose prepost chain is walked (NULL is tolerated)
 *	ctx	opaque context handed to every callback invocation
 *	cb	callback invoked per prepost; must not be NULL
 *
 * Conditions:
 *	wqset is locked
 *
 * Returns:
 *	WQ_ITERATE_SUCCESS if there was nothing to iterate or the walk ran to
 *	completion; otherwise the callback return value that stopped the walk.
 *
 * Notes:
 *	This loop performs automatic prepost chain management / culling, and
 *	may reset or adjust the waitq set's prepost ID pointer. If you don't
 *	want this extra processing, you can use wq_prepost_iterate().
 */
static int wq_prepost_foreach_locked(struct waitq_set *wqset,
				     void *ctx, wqp_callback_func cb)
{
	int ret = WQ_ITERATE_SUCCESS;
	struct wq_prepost *wqp, *tmp_wqp;

	assert(cb != NULL);

	if (!wqset || !waitq_set_maybe_preposted(wqset))
		return WQ_ITERATE_SUCCESS;

restart:
	/* (re)start from whatever the set currently names as its prepost head */
	wqp = wq_prepost_get(wqset->wqset_prepost_id);
	if (!wqp) {
		/*
		 * The prepost object is no longer valid, reset the waitq
		 * set's prepost id.
		 */
		wqset->wqset_prepost_id = 0;
		return WQ_ITERATE_SUCCESS;
	}

	/* a WQP_WQ head means the set has a single prepost and no POST list */
	if (wqp_type(wqp) == WQP_WQ) {
		uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;

		ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);

		switch (ret) {
		case WQ_ITERATE_INVALIDATE_CONTINUE:
			/* the caller wants to remove the only prepost here */
			assert(wqp_id == wqset->wqset_prepost_id);
			wqset->wqset_prepost_id = 0;
			/* fall through */
		case WQ_ITERATE_CONTINUE:
			wq_prepost_put(wqp);
			ret = WQ_ITERATE_SUCCESS;
			break;
		case WQ_ITERATE_RESTART:
			wq_prepost_put(wqp);
			/* fall through */
		case WQ_ITERATE_DROPPED:
			/* no put on DROPPED: we restart without touching wqp again */
			goto restart;
		default:
			/* any other code is handed back to the caller as-is */
			wq_prepost_put(wqp);
			break;
		}
		return ret;
	}

	assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);
	assert(wqp_type(wqp) == WQP_POST);

	/*
	 * At this point we know we have a list of POST objects.
	 * Grab a handle to the last element in the list and start
	 * the iteration.
	 */
	tmp_wqp = wq_prepost_get_rnext(wqp);
	assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);

	/* only the ID of the last element is kept; it marks where the walk ends */
	uint64_t last_id = tmp_wqp->wqp_prepostid.id;
	wq_prepost_put(tmp_wqp);

	ret = WQ_ITERATE_SUCCESS;
	for (;;) {
		uint64_t wqp_id, first_id, next_id;

		/*
		 * Snapshot the IDs before the callback runs, so that changes
		 * made during the callback can be detected afterwards.
		 */
		wqp_id = wqp->wqp_prepostid.id;
		first_id = wqset->wqset_prepost_id;
		next_id = wqp->wqp_post.wqp_next_id;

		/* grab the WQP_WQ object this _POST points to */
		tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
		if (!tmp_wqp) {
			/*
			 * This WQP_POST object points to an invalid
			 * WQP_WQ object - remove the POST object from
			 * the list.
			 */
			if (wq_prepost_remove(wqset, wqp) == 0) {
				/*
				 * The removal dissolved the list and rewrote
				 * the set's prepost ID: start over from it.
				 */
				wq_prepost_put(wqp);
				goto restart;
			}
			goto next_prepost;
		}
		assert(wqp_type(tmp_wqp) == WQP_WQ);
		/*
		 * make the callback: note that this could remove 'wqp' or
		 * drop the lock on our waitq set. We need to re-validate
		 * our state when this function returns.
		 */
		ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr);
		wq_prepost_put(tmp_wqp);

		switch (ret) {
		case WQ_ITERATE_CONTINUE:
			/* continue iteration */
			break;
		case WQ_ITERATE_INVALIDATE_CONTINUE:
			assert(next_id == wqp->wqp_post.wqp_next_id);
			if (wq_prepost_remove(wqset, wqp) == 0) {
				/* list dissolved by the removal: start over */
				wq_prepost_put(wqp);
				goto restart;
			}
			goto next_prepost;
		case WQ_ITERATE_RESTART:
			wq_prepost_put(wqp);
			/* fall-through */
		case WQ_ITERATE_DROPPED:
			/* the callback dropped the ref to wqp: just restart */
			goto restart;
		default:
			/* break out of the iteration for some other reason */
			goto finish_prepost_foreach;
		}

		/*
		 * the set lock may have been dropped during callback,
		 * if something looks different, restart the prepost iteration
		 */
		if (!wqp_is_valid(wqp) ||
		    (wqp->wqp_post.wqp_next_id != next_id) ||
		    wqset->wqset_prepost_id != first_id) {
			wq_prepost_put(wqp);
			goto restart;
		}

next_prepost:
		/* this was the last object in the list */
		if (wqp_id == last_id)
			break;

		/* get the next object */
		tmp_wqp = wq_prepost_get(next_id);
		if (!tmp_wqp) {
			/*
			 * At this point we've already checked our state
			 * after the callback (which may have dropped the set
			 * lock). If we find an invalid member of the list
			 * then something is wrong.
			 */
			panic("Invalid WQP_POST member 0x%llx in waitq set "
			      "0x%llx prepost list (first:%llx, "
			      "wqp:%p)",
			      next_id, wqset->wqset_id, first_id, wqp);
		}
		/* advance: release the current object and step to the next one */
		wq_prepost_put(wqp);
		wqp = tmp_wqp;

		assert(wqp_type(wqp) == WQP_POST);
	}

finish_prepost_foreach:
	/* drop the reference on the object we stopped at */
	wq_prepost_put(wqp);
	if (ret == WQ_ITERATE_CONTINUE)
		ret = WQ_ITERATE_SUCCESS;

	return ret;
}
1189
1190 /**
1191  * Perform a simple loop over a chain of prepost objects
1192  *
1193  * Conditions:
1194  *      If 'prepost_id' is associated with a waitq (set) then that object must
1195  *      be locked before calling this function.
1196  *      Callback function, 'cb', must be able to handle a NULL wqset pointer
1197  *      and a NULL waitq pointer!
1198  *
1199  * Notes:
1200  *      This prepost chain iteration will _not_ automatically adjust any chain
1201  *      element or linkage. This is the responsibility of the caller! If you
1202  *      want automatic prepost chain management (at a cost of extra CPU time),
1203  *      you can use: wq_prepost_foreach_locked().
1204  */
1205 static int wq_prepost_iterate(uint64_t prepost_id,
1206                               void *ctx, wqp_callback_func cb)
1207 {
1208         int ret;
1209         struct wq_prepost *wqp;
1210
1211         if (!prepost_id)
1212                 return WQ_ITERATE_SUCCESS;
1213
1214         wqp = wq_prepost_get(prepost_id);
1215         if (!wqp)
1216                 return WQ_ITERATE_SUCCESS;
1217
1218         if (wqp_type(wqp) == WQP_WQ) {
1219                 ret = WQ_ITERATE_SUCCESS;
1220                 if (cb)
1221                         ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1222
1223                 if (ret != WQ_ITERATE_DROPPED)
1224                         wq_prepost_put(wqp);
1225                 return ret;
1226         }
1227
1228         assert(wqp->wqp_prepostid.id == prepost_id);
1229         assert(wqp_type(wqp) == WQP_POST);
1230
1231         /* at this point we know we have a list of POST objects */
1232         uint64_t next_id;
1233
1234         ret = WQ_ITERATE_CONTINUE;
1235         do {
1236                 struct wq_prepost *tmp_wqp;
1237                 struct waitq *wq = NULL;
1238
1239                 next_id = wqp->wqp_post.wqp_next_id;
1240
1241                 /* grab the WQP_WQ object this _POST points to */
1242                 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1243                 if (tmp_wqp) {
1244                         assert(wqp_type(tmp_wqp) == WQP_WQ);
1245                         wq = tmp_wqp->wqp_wq.wqp_wq_ptr;
1246                 }
1247
1248                 if (cb)
1249                         ret = cb(NULL, ctx, wqp, wq);
1250                 if (tmp_wqp)
1251                         wq_prepost_put(tmp_wqp);
1252
1253                 if (ret != WQ_ITERATE_CONTINUE)
1254                         break;
1255
1256                 tmp_wqp = wq_prepost_get(next_id);
1257                 if (!tmp_wqp) {
1258                         /*
1259                          * the chain is broken: nothing we can do here besides
1260                          * bail from the iteration.
1261                          */
1262                         ret = WQ_ITERATE_ABORTED;
1263                         break;
1264                 }
1265
1266                 wq_prepost_put(wqp);
1267                 wqp = tmp_wqp;
1268
1269                 assert(wqp_type(wqp) == WQP_POST);
1270         } while (next_id != prepost_id);
1271
1272         if (ret != WQ_ITERATE_DROPPED)
1273                 wq_prepost_put(wqp);
1274
1275         if (ret == WQ_ITERATE_CONTINUE)
1276                 ret = WQ_ITERATE_SUCCESS;
1277         return ret;
1278 }
1279
1280
/* Context passed to wq_is_preposted_on_set_cb() through the iterator. */
struct _is_posted_ctx {
	struct waitq *posting_wq;	/* the waitq being searched for */
	int did_prepost;		/* set to 1 by the callback when posting_wq is seen */
};
1285
1286 static int wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,
1287                                      struct wq_prepost *wqp, struct waitq *waitq)
1288 {
1289         struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;
1290
1291         (void)wqset;
1292         (void)wqp;
1293
1294         /*
1295          * Don't early-out, run through the _entire_ list:
1296          * This ensures that we retain a minimum number of invalid elements.
1297          */
1298         if (pctx->posting_wq == waitq)
1299                 pctx->did_prepost = 1;
1300
1301         return WQ_ITERATE_CONTINUE;
1302 }
1303
1304
1305 /**
1306  * checks if 'waitq' has already preposted on 'wqset'
1307  *
1308  * Parameters:
1309  *      waitq    The waitq that's preposting
1310  *      wqset    The set onto which waitq may be preposted
1311  *
1312  * Conditions:
1313  *      both waitq and wqset are locked
1314  *
1315  * Returns non-zero if 'waitq' has already preposted to 'wqset'
1316  */
1317 static int wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)
1318 {
1319         int ret;
1320         struct _is_posted_ctx pctx;
1321
1322         /*
1323          * If the set's only prepost matches the waitq's prepost ID,
1324          * then it obviously already preposted to the set.
1325          */
1326         if (waitq->waitq_prepost_id != 0 &&
1327             wqset->wqset_prepost_id == waitq->waitq_prepost_id)
1328                 return 1;
1329
1330         /* use full prepost iteration: always trim the list */
1331         pctx.posting_wq = waitq;
1332         pctx.did_prepost = 0;
1333         ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,
1334                                         wq_is_preposted_on_set_cb);
1335         return pctx.did_prepost;
1336 }
1337
1338 static struct wq_prepost *wq_get_prepost_obj(uint64_t *reserved, int type)
1339 {
1340         struct wq_prepost *wqp = NULL;
1341         /*
1342          * don't fail just because the caller doesn't have enough
1343          * reservations, we've kept a low-water mark on the prepost table,
1344          * so there should be some available for us.
1345          */
1346         if (reserved && *reserved) {
1347                 wqp = wq_prepost_rpop(reserved, type);
1348                 assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem);
1349         } else {
1350                 /*
1351                  * TODO: if in interrupt context, grab from a special
1352                  *       region / reserved list!
1353                  */
1354                 wqp = wq_prepost_alloc(type, 1);
1355         }
1356
1357         if (wqp == NULL)
1358                 panic("Couldn't allocate prepost object!");
1359         return wqp;
1360 }
1361
1362
1363 /**
1364  * prepost a waitq onto a waitq set
1365  *
1366  * Parameters:
1367  *      wqset    The set onto which waitq will be preposted
1368  *      waitq    The waitq that's preposting
1369  *      reserved List (lt_elem_list_ style) of pre-allocated prepost elements
1370  *               Could be NULL
1371  *
1372  * Conditions:
1373  *      both wqset and waitq are locked
1374  *
1375  * Notes:
1376  *      If reserved is NULL, this may block on prepost table growth.
1377  */
1378 static void wq_prepost_do_post_locked(struct waitq_set *wqset,
1379                                       struct waitq *waitq,
1380                                       uint64_t *reserved)
1381 {
1382         struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;
1383
1384         assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));
1385
1386         /*
1387          * nothing to do if it's already preposted:
1388          * note that this also culls any invalid prepost objects
1389          */
1390         if (wq_is_preposted_on_set(waitq, wqset))
1391                 return;
1392
1393         /*
1394          * This function is called because an event is being posted to 'waitq'.
1395          * We need a prepost object associated with this queue. Allocate one
1396          * now if the waitq isn't already associated with one.
1397          */
1398         if (waitq->waitq_prepost_id == 0) {
1399                 struct wq_prepost *wqp;
1400                 wqp = wq_get_prepost_obj(reserved, WQP_WQ);
1401                 wqp->wqp_wq.wqp_wq_ptr = waitq;
1402                 wqp_set_valid(wqp);
1403                 waitq->waitq_prepost_id = wqp->wqp_prepostid.id;
1404                 wq_prepost_put(wqp);
1405         }
1406
1407 #if CONFIG_LTABLE_STATS
1408         g_prepost_table.npreposts += 1;
1409 #endif
1410
1411         wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",
1412                 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),
1413                 waitq->waitq_prepost_id, wqset->wqset_id);
1414
1415         if (wqset->wqset_prepost_id == 0) {
1416                 /* the set has no previous preposts */
1417                 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1418                 return;
1419         }
1420
1421         wqp_head = wq_prepost_get(wqset->wqset_prepost_id);
1422         if (!wqp_head) {
1423                 /* the previous prepost has become invalid */
1424                 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1425                 return;
1426         }
1427
1428         assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);
1429
1430         /*
1431          * If we get here, we're going to need at least one new wq_prepost
1432          * object. If the previous wqset_prepost_id points to a WQP_WQ, we
1433          * actually need to allocate 2 wq_prepost objects because the WQP_WQ
1434          * is tied to the waitq and shared across all sets.
1435          */
1436         wqp_post = wq_get_prepost_obj(reserved, WQP_POST);
1437
1438         wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;
1439         wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,
1440                 waitq->waitq_prepost_id);
1441
1442         if (wqp_type(wqp_head) == WQP_WQ) {
1443                 /*
1444                  * We must replace the wqset_prepost_id with a pointer
1445                  * to two new WQP_POST objects
1446                  */
1447                 uint64_t wqp_id = wqp_head->wqp_prepostid.id;
1448                 wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "
1449                         "replacing with two POST preposts",
1450                         wqset->wqset_id, wqp_id);
1451
1452                 /* drop the old reference */
1453                 wq_prepost_put(wqp_head);
1454
1455                 /* grab another new object (the 2nd of two) */
1456                 wqp_head = wq_get_prepost_obj(reserved, WQP_POST);
1457
1458                 /* point this one to the original WQP_WQ object */
1459                 wqp_head->wqp_post.wqp_wq_id = wqp_id;
1460                 wqdbg_v("POST 0x%llx :: WQ 0x%llx",
1461                         wqp_head->wqp_prepostid.id, wqp_id);
1462
1463                 /* link it to the new wqp_post object allocated earlier */
1464                 wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1465                 /* make the list a double-linked and circular */
1466                 wq_prepost_rlink(wqp_head, wqp_post);
1467
1468                 /*
1469                  * Finish setting up the new prepost: point it back to the
1470                  * POST object we allocated to replace the original wqset
1471                  * WQ prepost object
1472                  */
1473                 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1474                 wq_prepost_rlink(wqp_post, wqp_head);
1475
1476                 /* mark objects valid, and reset the wqset prepost list head */
1477                 wqp_set_valid(wqp_head);
1478                 wqp_set_valid(wqp_post);
1479                 wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;
1480
1481                 /* release both references */
1482                 wq_prepost_put(wqp_head);
1483                 wq_prepost_put(wqp_post);
1484
1485                 wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",
1486                         wqset->wqset_id, wqset->wqset_prepost_id,
1487                         wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,
1488                         wqp_post->wqp_prepostid.id,
1489                         wqp_post->wqp_post.wqp_next_id);
1490                 return;
1491         }
1492
1493         assert(wqp_type(wqp_head) == WQP_POST);
1494
1495         /*
1496          * Add the new prepost to the end of the prepost list
1497          */
1498         wqp_tail = wq_prepost_get_rnext(wqp_head);
1499         assert(wqp_tail != NULL);
1500         assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);
1501
1502         /*
1503          * link the head to the new tail
1504          * NOTE: this needs to happen first in case wqp_tail == wqp_head
1505          */
1506         wq_prepost_reset_rnext(wqp_head);
1507         wq_prepost_rlink(wqp_head, wqp_post);
1508
1509         /* point the new object to the list head, and list tail */
1510         wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1511         wq_prepost_rlink(wqp_post, wqp_tail);
1512
1513         /* point the last item in the waitq set's list to the new object */
1514         wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1515
1516         wqp_set_valid(wqp_post);
1517
1518         wq_prepost_put(wqp_head);
1519         wq_prepost_put(wqp_tail);
1520         wq_prepost_put(wqp_post);
1521
1522         wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "
1523                 "new_prepost:0x%llx->0x%llx", wqset->wqset_id,
1524                 wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,
1525                 wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);
1526
1527         return;
1528 }
1529
1530
1531 /* ----------------------------------------------------------------------
1532  *
1533  * Stats collection / reporting
1534  *
1535  * ---------------------------------------------------------------------- */
1536 #if CONFIG_LTABLE_STATS && CONFIG_WAITQ_STATS
1537 static void wq_table_stats(struct link_table *table, struct wq_table_stats *stats)
1538 {
1539         stats->version = WAITQ_STATS_VERSION;
1540         stats->table_elements = table->nelem;
1541         stats->table_used_elems = table->used_elem;
1542         stats->table_elem_sz = table->elem_sz;
1543         stats->table_slabs = table->nslabs;
1544         stats->table_slab_sz = table->slab_sz;
1545
1546         stats->table_num_allocs = table->nallocs;
1547         stats->table_num_preposts = table->npreposts;
1548         stats->table_num_reservations = table->nreservations;
1549
1550         stats->table_max_used = table->max_used;
1551         stats->table_avg_used = table->avg_used;
1552         stats->table_max_reservations = table->max_reservations;
1553         stats->table_avg_reservations = table->avg_reservations;
1554 }
1555
1556 void waitq_link_stats(struct wq_table_stats *stats)
1557 {
1558         if (!stats)
1559                 return;
1560         wq_table_stats(&g_wqlinktable, stats);
1561 }
1562
1563 void waitq_prepost_stats(struct wq_table_stats *stats)
1564 {
1565         wq_table_stats(&g_prepost_table, stats);
1566 }
1567 #endif
1568
1569
1570 /* ----------------------------------------------------------------------
1571  *
1572  * Global Wait Queues
1573  *
1574  * ---------------------------------------------------------------------- */
1575
/*
 * Global wait queue table. As initialized here it consists of the single
 * statically allocated g_boot_waitq, and g_num_waitqs is 1 accordingly.
 */
static struct waitq g_boot_waitq;
static struct waitq *global_waitqs = &g_boot_waitq;
static uint32_t g_num_waitqs = 1;

/*
 * Reduce 'event' to its low _EVENT_MASK_BITS bits: the value is converted to
 * uintptr_t and every bit above that width is cleared.
 */
#define _CAST_TO_EVENT_MASK(event)   ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
1584
1585 static __inline__ uint32_t waitq_hash(char *key, size_t length)
1586 {
1587         uint32_t hash = jenkins_hash(key, length);
1588
1589         hash &= (g_num_waitqs - 1);
1590         return hash;
1591 }
1592
1593 /* return a global waitq pointer corresponding to the given event */
1594 struct waitq *_global_eventq(char *event, size_t event_length)
1595 {
1596         return &global_waitqs[waitq_hash(event, event_length)];
1597 }
1598
1599 /* return an indexed global waitq pointer */
1600 struct waitq *global_waitq(int index)
1601 {
1602         return &global_waitqs[index % g_num_waitqs];
1603 }
1604
1605
1606 #if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
1607 /* this global is for lldb */
1608 const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
1609
1610 static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
1611 {
1612         uintptr_t buf[NWAITQ_BTFRAMES + skip];
1613         if (skip < 0)
1614                 skip = 0;
1615         memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
1616         backtrace(buf, g_nwaitq_btframes + skip);
1617         memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
1618 }
1619 #else /* no stats */
1620 #define waitq_grab_backtrace(...)
1621 #endif
1622
1623 #if CONFIG_WAITQ_STATS
1624
/*
 * Statistics storage for the global waitqs, indexed like global_waitqs (see
 * waitq_global_stats()). As initialized here, g_waitq_stats points at the
 * single statically allocated g_boot_stats entry.
 */
struct wq_stats g_boot_stats;
struct wq_stats *g_waitq_stats = &g_boot_stats;
1627
1628 static __inline__ struct wq_stats *waitq_global_stats(struct waitq *waitq) {
1629         struct wq_stats *wqs;
1630         uint32_t idx;
1631
1632         if (!waitq_is_global(waitq))
1633                 return NULL;
1634
1635         idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));
1636         assert(idx < g_num_waitqs);
1637         wqs = &g_waitq_stats[idx];
1638         return wqs;
1639 }
1640
1641 static __inline__ void waitq_stats_count_wait(struct waitq *waitq)
1642 {
1643         struct wq_stats *wqs = waitq_global_stats(waitq);
1644         if (wqs != NULL) {
1645                 wqs->waits++;
1646                 waitq_grab_backtrace(wqs->last_wait, 2);
1647         }
1648 }
1649
1650 static __inline__ void waitq_stats_count_wakeup(struct waitq *waitq)
1651 {
1652         struct wq_stats *wqs = waitq_global_stats(waitq);
1653         if (wqs != NULL) {
1654                 wqs->wakeups++;
1655                 waitq_grab_backtrace(wqs->last_wakeup, 2);
1656         }
1657 }
1658
1659 static __inline__ void waitq_stats_count_clear_wakeup(struct waitq *waitq)
1660 {
1661         struct wq_stats *wqs = waitq_global_stats(waitq);
1662         if (wqs != NULL) {
1663                 wqs->wakeups++;
1664                 wqs->clears++;
1665                 waitq_grab_backtrace(wqs->last_wakeup, 2);
1666         }
1667 }
1668
1669 static __inline__ void waitq_stats_count_fail(struct waitq *waitq)
1670 {
1671         struct wq_stats *wqs = waitq_global_stats(waitq);
1672         if (wqs != NULL) {
1673                 wqs->failed_wakeups++;
1674                 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
1675         }
1676 }
1677 #else /* !CONFIG_WAITQ_STATS */
/*
 * Statistics are compiled out: every accounting hook expands to an empty
 * statement and its argument is not evaluated.
 */
#define waitq_stats_count_wait(q)		do { } while (0)
#define waitq_stats_count_wakeup(q)		do { } while (0)
#define waitq_stats_count_clear_wakeup(q)	do { } while (0)
#define waitq_stats_count_fail(q)		do { } while (0)
1682 #endif
1683
1684 int waitq_is_valid(struct waitq *waitq)
1685 {
1686         return (waitq != NULL) && waitq->waitq_isvalid && ((waitq->waitq_type & ~1) == WQT_QUEUE);
1687 }
1688
1689 int waitq_set_is_valid(struct waitq_set *wqset)
1690 {
1691         return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset);
1692 }
1693
1694 int waitq_is_global(struct waitq *waitq)
1695 {
1696         if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs)
1697                 return 1;
1698         return 0;
1699 }
1700
1701 int waitq_irq_safe(struct waitq *waitq)
1702 {
1703         /* global wait queues have this bit set on initialization */
1704         return waitq->waitq_irq;
1705 }
1706
1707 static uint32_t waitq_hash_size(void)
1708 {
1709         uint32_t hsize, queues;
1710
1711         if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize)))
1712                 return (hsize);
1713
1714         queues = thread_max / 5;
1715         hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);
1716
1717         return hsize;
1718 }
1719
1720 void waitq_bootstrap(void)
1721 {
1722         kern_return_t kret;
1723         uint32_t whsize, qsz, tmp32;
1724
1725         g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;
1726         if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE)
1727                 g_min_free_table_elem = tmp32;
1728         wqdbg("Minimum free table elements: %d", tmp32);
1729
1730         /*
1731          * Determine the amount of memory we're willing to reserve for
1732          * the waitqueue hash table
1733          */
1734         whsize = waitq_hash_size();
1735
1736         /* Determine the number of waitqueues we can fit. */
1737         qsz = sizeof(struct waitq);
1738         whsize = ROUNDDOWN(whsize, qsz);
1739         g_num_waitqs = whsize / qsz;
1740
1741         /*
1742          * The hash algorithm requires that this be a power of 2, so we
1743          * just mask off all the low-order bits.
1744          */
1745         for (uint32_t i = 0; i < 31; i++) {
1746                 uint32_t bit = (1 << i);
1747                 if ((g_num_waitqs & bit) == g_num_waitqs)
1748                         break;
1749                 g_num_waitqs &= ~bit;
1750         }
1751         assert(g_num_waitqs > 0);
1752
1753         /* Now determine how much memory we really need. */
1754         whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);
1755
1756         wqdbg("allocating %d global queues  (%d bytes)", g_num_waitqs, whsize);
1757         kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
1758                                       whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1759         if (kret != KERN_SUCCESS || global_waitqs == NULL)
1760                 panic("kernel_memory_allocate() failed to alloc global_waitqs"
1761                       ", error: %d, whsize: 0x%x", kret, whsize);
1762
1763 #if CONFIG_WAITQ_STATS
1764         whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);
1765         kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
1766                                       whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1767         if (kret != KERN_SUCCESS || global_waitqs == NULL)
1768                 panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
1769                       ", error: %d, whsize: 0x%x", kret, whsize);
1770         memset(g_waitq_stats, 0, whsize);
1771 #endif
1772
1773         for (uint32_t i = 0; i < g_num_waitqs; i++) {
1774                 waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ);
1775         }
1776
1777         waitq_set_zone = zinit(sizeof(struct waitq_set),
1778                                WAITQ_SET_MAX * sizeof(struct waitq_set),
1779                                sizeof(struct waitq_set),
1780                                "waitq sets");
1781         zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE);
1782
1783         /* initialize the global waitq link table */
1784         wql_init();
1785
1786         /* initialize the global waitq prepost table */
1787         wqp_init();
1788 }
1789
1790
1791 /* ----------------------------------------------------------------------
1792  *
1793  * Wait Queue Implementation
1794  *
1795  * ---------------------------------------------------------------------- */
1796
1797 /*
1798  * Double the standard lock timeout, because wait queues tend
1799  * to iterate over a number of threads - locking each.  If there is
1800  * a problem with a thread lock, it normally times out at the wait
1801  * queue level first, hiding the real problem.
1802  */
1803 /* For x86, the hardware timeout is in TSC units. */
1804 #if defined(__i386__) || defined(__x86_64__)
1805 #define hwLockTimeOut LockTimeOutTSC
1806 #else
1807 #define hwLockTimeOut LockTimeOut
1808 #endif
1809
1810 void waitq_lock(struct waitq *wq)
1811 {
1812         if (__improbable(waitq_lock_to(wq,
1813                                     hwLockTimeOut * 2) == 0)) {
1814                 boolean_t wql_acquired = FALSE;
1815
1816                 while (machine_timeout_suspended()) {
1817                         mp_enable_preemption();
1818                         wql_acquired = waitq_lock_to(wq,
1819                                                   hwLockTimeOut * 2);
1820                         if (wql_acquired)
1821                                 break;
1822                 }
1823                 if (wql_acquired == FALSE)
1824                         panic("waitq deadlock - waitq=%p, cpu=%d\n",
1825                               wq, cpu_number());
1826         }
1827 #if defined(__x86_64__)
1828         pltrace(FALSE);
1829 #endif
1830         assert(waitq_held(wq));
1831 }
1832
1833 void waitq_unlock(struct waitq *wq)
1834 {
1835         assert(waitq_held(wq));
1836 #if defined(__x86_64__)
1837         pltrace(TRUE);
1838 #endif
1839         waitq_lock_unlock(wq);
1840 }
1841
1842
1843 /**
1844  * clear the thread-related waitq state
1845  *
1846  * Conditions:
1847  *      'thread' is locked
1848  */
1849 static inline void thread_clear_waitq_state(thread_t thread)
1850 {
1851         thread->waitq = NULL;
1852         thread->wait_event = NO_EVENT64;
1853         thread->at_safe_point = FALSE;
1854 }
1855
1856
1857 typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,
1858                                     int is_global, thread_t thread);
1859
1860 struct waitq_select_args {
1861         /* input parameters */
1862         struct waitq    *posted_waitq;
1863         struct waitq    *waitq;
1864         event64_t        event;
1865         waitq_select_cb  select_cb;
1866         void            *select_ctx;
1867
1868         uint64_t        *reserved_preposts;
1869
1870         /* output parameters */
1871         queue_t       threadq;
1872         int           max_threads;
1873         int          *nthreads;
1874         spl_t        *spl;
1875 };
1876
1877 static void do_waitq_select_n_locked(struct waitq_select_args *args);
1878
1879 /**
1880  * callback invoked once for every waitq set to which a waitq belongs
1881  *
1882  * Conditions:
1883  *      ctx->posted_waitq is locked
1884  *      'link' points to a valid waitq set
1885  *
1886  * Notes:
1887  *      Takes the waitq set lock on the set pointed to by 'link'
1888  *      Calls do_waitq_select_n_locked() which could recurse back into
1889  *      this function if the waitq set is a member of other sets.
1890  *      If no threads were selected, it preposts the input waitq
1891  *      onto the waitq set pointed to by 'link'.
1892  */
1893 static int waitq_select_walk_cb(struct waitq *waitq, void *ctx,
1894                                 struct waitq_link *link)
1895 {
1896         int ret = WQ_ITERATE_CONTINUE;
1897         struct waitq_select_args args = *((struct waitq_select_args *)ctx);
1898         struct waitq_set *wqset;
1899
1900         (void)waitq;
1901         assert(wql_type(link) == WQL_WQS);
1902
1903         wqset = link->wql_wqs.wql_set;
1904         args.waitq = &wqset->wqset_q;
1905
1906         assert(!waitq_irq_safe(waitq));
1907         assert(!waitq_irq_safe(&wqset->wqset_q));
1908
1909         waitq_set_lock(wqset);
1910         /*
1911          * verify that the link wasn't invalidated just before
1912          * we were able to take the lock.
1913          */
1914         if (wqset->wqset_id != link->wql_setid.id)
1915                 goto out_unlock;
1916
1917         /*
1918          * Find any threads waiting on this wait queue set,
1919          * and recurse into any waitq set to which this set belongs.
1920          */
1921         do_waitq_select_n_locked(&args);
1922
1923         if (*(args.nthreads) > 0 ||
1924             (args.threadq && !queue_empty(args.threadq))) {
1925                 /* at least 1 thread was selected and returned: don't prepost */
1926                 if (args.max_threads > 0 &&
1927                     *(args.nthreads) >= args.max_threads) {
1928                         /* break out of the setid walk */
1929                         ret = WQ_ITERATE_FOUND;
1930                 }
1931                 goto out_unlock;
1932         } else {
1933                 /*
1934                  * No thread selected: prepost 'waitq' to 'wqset'
1935                  * if wqset can handle preposts and the event is set to 0.
1936                  * We also make sure to not post waitq sets to other sets.
1937                  *
1938                  * If the set doesn't support preposts, but does support
1939                  * prepost callout/hook interaction, invoke the predefined
1940                  * callout function and pass the set's 'prepost_hook.' This
1941                  * could potentially release another thread to handle events.
1942                  */
1943                 if (args.event == NO_EVENT64) {
1944                         if (waitq_set_can_prepost(wqset)) {
1945                                 wq_prepost_do_post_locked(
1946                                         wqset, waitq, args.reserved_preposts);
1947                         } else if (waitq_set_has_prepost_hook(wqset)) {
1948                                 waitq_set__CALLING_PREPOST_HOOK__(
1949                                         wqset->wqset_prepost_hook, waitq, 0);
1950                         }
1951                 }
1952         }
1953
1954 out_unlock:
1955         waitq_set_unlock(wqset);
1956         return ret;
1957 }
1958
1959 /**
1960  * generic thread selection from a waitq (and sets to which the waitq belongs)
1961  *
1962  * Conditions:
1963  *      args->waitq (and args->posted_waitq) is locked
1964  *
1965  * Notes:
1966  *      Uses the optional select callback function to refine the selection
1967  *      of one or more threads from a waitq and any set to which the waitq
1968  *      belongs. The select callback is invoked once for every thread that
1969  *      is found to be waiting on the input args->waitq.
1970  *
1971  *      If one or more threads are selected, this may disable interrupts.
1972  *      The previous interrupt state is returned in args->spl and should
1973  *      be used in a call to splx() if threads are returned to the caller.
1974  */
1975 static void do_waitq_select_n_locked(struct waitq_select_args *args)
1976 {
1977         struct waitq *waitq = args->waitq;
1978         int max_threads = args->max_threads;
1979         thread_t thread = THREAD_NULL, first_thread = THREAD_NULL;
1980         struct waitq *safeq;
1981         uint32_t remaining_eventmask = 0;
1982         uint32_t eventmask;
1983         int *nthreads = args->nthreads;
1984         spl_t spl = 0;
1985
1986         assert(max_threads != 0);
1987
1988         if (!waitq_irq_safe(waitq)) {
1989                 /* JMM - add flag to waitq to avoid global lookup if no waiters */
1990                 eventmask = _CAST_TO_EVENT_MASK(waitq);
1991                 safeq = global_eventq(waitq);
1992                 if (*nthreads == 0)
1993                         spl = splsched();
1994                 waitq_lock(safeq);
1995         } else {
1996                 eventmask = _CAST_TO_EVENT_MASK(args->event);
1997                 safeq = waitq;
1998         }
1999
2000         /*
2001          * If the safeq doesn't have an eventmask (not global) or the event
2002          * we're looking for IS set in its eventmask, then scan the threads
2003          * in that queue for ones that match the original <waitq,event> pair.
2004          */
2005         if (!waitq_is_global(safeq) ||
2006             (safeq->waitq_eventmask & eventmask) == eventmask) {
2007
2008                 /* look through each thread waiting directly on the safeq */
2009                 qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
2010                         thread_t t = THREAD_NULL;
2011                         assert_thread_magic(thread);
2012
2013                         if (thread->waitq == waitq && thread->wait_event == args->event) {
2014                                 t = thread;
2015                                 if (first_thread == THREAD_NULL)
2016                                         first_thread = thread;
2017
2018                                 /* allow the caller to futher refine the selection */
2019                                 if (args->select_cb)
2020                                         t = args->select_cb(args->select_ctx, waitq,
2021                                                             waitq_is_global(waitq), thread);
2022                                 if (t != THREAD_NULL) {
2023                                         *nthreads += 1;
2024                                         if (args->threadq) {
2025                                                 if (*nthreads == 1)
2026                                                         *(args->spl) = (safeq != waitq) ? spl : splsched();
2027                                                 thread_lock(t);
2028                                                 thread_clear_waitq_state(t);
2029                                                 /* put locked thread on output queue */
2030                                                 re_queue_tail(args->threadq, &t->wait_links);
2031                                         }
2032                                         /* only enqueue up to 'max' threads */
2033                                         if (*nthreads >= max_threads && max_threads > 0)
2034                                                 break;
2035                                 }
2036                         }
2037                         /* thread wasn't selected so track it's event */
2038                         if (t == THREAD_NULL) {
2039                                 remaining_eventmask |= (thread->waitq != safeq) ?
2040                                     _CAST_TO_EVENT_MASK(thread->waitq):
2041                                     _CAST_TO_EVENT_MASK(thread->wait_event);
2042                         }
2043                 }
2044
2045                 /*
2046                  * Update the eventmask of global queues we just scanned:
2047                  * - If we selected all the threads in the queue, we can clear its
2048                  *   eventmask.
2049                  *
2050                  * - If we didn't find enough threads to fill our needs, then we can
2051                  *   assume we looked at every thread in the queue and the mask we
2052                  *   computed is complete - so reset it.
2053                  */
2054                 if (waitq_is_global(safeq)) {
2055                         if (queue_empty(&safeq->waitq_queue))
2056                                 safeq->waitq_eventmask = 0;
2057                         else if (max_threads < 0 || *nthreads < max_threads)
2058                                 safeq->waitq_eventmask = remaining_eventmask;
2059                 }
2060         }
2061
2062         /*
2063          * Grab the first thread in the queue if no other thread was selected.
2064          * We can guarantee that no one has manipulated this thread because
2065          * it's waiting on the given waitq, and we have that waitq locked.
2066          */
2067         if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {
2068                 /* we know this is the first (and only) thread */
2069                 ++(*nthreads);
2070                 *(args->spl) = (safeq != waitq) ? spl : splsched();
2071                 thread_lock(first_thread);
2072                 thread_clear_waitq_state(first_thread);
2073                 re_queue_tail(args->threadq, &first_thread->wait_links);
2074
2075                 /* update the eventmask on [now] empty global queues */
2076                 if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue))
2077                         safeq->waitq_eventmask = 0;
2078         }
2079
2080         /* unlock the safe queue if we locked one above */
2081         if (safeq != waitq) {
2082                 waitq_unlock(safeq);
2083                 if (*nthreads == 0)
2084                         splx(spl);
2085         }
2086
2087         if (max_threads > 0 && *nthreads >= max_threads)
2088                 return;
2089
2090         /*
2091          * wait queues that are not in any sets
2092          * are the bottom of the recursion
2093          */
2094         if (!waitq->waitq_set_id)
2095                 return;
2096
2097         /* check to see if the set ID for this wait queue is valid */
2098         struct waitq_link *link = wql_get_link(waitq->waitq_set_id);
2099         if (!link) {
2100                 /* the waitq set to which this waitq belonged, has been invalidated */
2101                 waitq->waitq_set_id = 0;
2102                 return;
2103         }
2104
2105         wql_put_link(link);
2106
2107         /*
2108          * If this waitq is a member of any wait queue sets, we need to look
2109          * for waiting thread(s) in any of those sets, and prepost all sets that
2110          * don't have active waiters.
2111          *
2112          * Note that we do a local walk of this waitq's links - we manually
2113          * recurse down wait queue set's with non-zero wqset_q.waitq_set_id
2114          */
2115         (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
2116                                WQL_WQS, (void *)args, waitq_select_walk_cb);
2117 }
2118
2119 /**
2120  * main entry point for thread selection from a waitq
2121  *
2122  * Conditions:
2123  *      waitq is locked
2124  *
2125  * Returns:
2126  *      The number of threads waiting on 'waitq' for 'event' which have
2127  *      been placed onto the input 'threadq'
2128  *
2129  * Notes:
2130  *      The 'select_cb' function is invoked for every thread found waiting
2131  *      on 'waitq' for 'event'. The thread is _not_ locked upon callback
2132  *      invocation. This parameter may be NULL.
2133  *
2134  *      If one or more threads are returned in 'threadq' then the caller is
2135  *      responsible to call splx() using the returned 'spl' value. Each
2136  *      returned thread is locked.
2137  */
2138 static __inline__ int waitq_select_n_locked(struct waitq *waitq,
2139                                             event64_t event,
2140                                             waitq_select_cb select_cb,
2141                                             void *select_ctx,
2142                                             uint64_t *reserved_preposts,
2143                                             queue_t threadq,
2144                                             int max_threads, spl_t *spl)
2145 {
2146         int nthreads = 0;
2147
2148         struct waitq_select_args args = {
2149                 .posted_waitq = waitq,
2150                 .waitq = waitq,
2151                 .event = event,
2152                 .select_cb = select_cb,
2153                 .select_ctx = select_ctx,
2154                 .reserved_preposts = reserved_preposts,
2155                 .threadq = threadq,
2156                 .max_threads = max_threads,
2157                 .nthreads = &nthreads,
2158                 .spl = spl,
2159         };
2160
2161         do_waitq_select_n_locked(&args);
2162         return nthreads;
2163 }
2164
2165 /**
2166  * select from a waitq a single thread waiting for a given event
2167  *
2168  * Conditions:
2169  *      'waitq' is locked
2170  *
2171  * Returns:
2172  *      A locked thread that's been removed from the waitq, but has not
2173  *      yet been put on a run queue. Caller is responsible to call splx
2174  *      with the '*spl' value.
2175  */
2176 static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
2177                                         uint64_t *reserved_preposts,
2178                                         int priority, spl_t *spl)
2179 {
2180         (void)priority;
2181         int nthreads;
2182         queue_head_t threadq;
2183
2184         queue_init(&threadq);
2185
2186         nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
2187                                          reserved_preposts, &threadq, 1, spl);
2188
2189         /* if we selected a thread, return it (still locked) */
2190         if (!queue_empty(&threadq)) {
2191                 thread_t t;
2192                 queue_entry_t qe = dequeue_head(&threadq);
2193                 t = qe_element(qe, struct thread, wait_links);
2194                 assert(queue_empty(&threadq)); /* there should be 1 entry */
2195                 /* t has been locked and removed from all queues */
2196                 return t;
2197         }
2198
2199         return THREAD_NULL;
2200 }
2201
2202 struct find_max_pri_ctx {
2203         integer_t max_sched_pri;
2204         integer_t max_base_pri;
2205         thread_t highest_thread;
2206 };
2207
2208 /**
2209  * callback function that finds the max priority thread
2210  *
2211  * Conditions:
2212  *      'waitq' is locked
2213  *      'thread' is not locked
2214  */
2215 static thread_t
2216 waitq_find_max_pri_cb(void         *ctx_in,
2217              __unused struct waitq *waitq,
2218              __unused int           is_global,
2219                       thread_t      thread)
2220 {
2221         struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in;
2222
2223         /*
2224          * thread is not locked, use pri as a hint only
2225          * wake up the highest base pri, and find the highest sched pri at that base pri
2226          */
2227         integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri;
2228         integer_t base_pri  = *(volatile int16_t *)&thread->base_pri;
2229
2230         if (ctx->highest_thread == THREAD_NULL ||
2231             (base_pri > ctx->max_base_pri) ||
2232             (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) {
2233                 /* don't select the thread, just update ctx */
2234
2235                 ctx->max_sched_pri  = sched_pri;
2236                 ctx->max_base_pri   = base_pri;
2237                 ctx->highest_thread = thread;
2238         }
2239
2240         return THREAD_NULL;
2241 }
2242
2243 /**
2244  * select from a waitq the highest priority thread waiting for a given event
2245  *
2246  * Conditions:
2247  *      'waitq' is locked
2248  *
2249  * Returns:
2250  *      A locked thread that's been removed from the waitq, but has not
2251  *      yet been put on a run queue. Caller is responsible to call splx
2252  *      with the '*spl' value.
2253  */
2254 static thread_t
2255 waitq_select_max_locked(struct waitq *waitq, event64_t event,
2256                         uint64_t *reserved_preposts,
2257                         spl_t *spl)
2258 {
2259         __assert_only int nthreads;
2260         assert(!waitq->waitq_set_id); /* doesn't support recursive sets */
2261
2262         struct find_max_pri_ctx ctx = {
2263                 .max_sched_pri = 0,
2264                 .max_base_pri = 0,
2265                 .highest_thread = THREAD_NULL,
2266         };
2267
2268         /*
2269          * Scan the waitq to find the highest priority thread.
2270          * This doesn't remove any thread from the queue
2271          */
2272         nthreads = waitq_select_n_locked(waitq, event, waitq_find_max_pri_cb, &ctx,
2273                                          reserved_preposts, NULL, 1, spl);
2274
2275         assert(nthreads == 0);
2276
2277         if (ctx.highest_thread != THREAD_NULL) {
2278                 __assert_only kern_return_t ret;
2279
2280                 /* Remove only the thread we just found */
2281                 ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl);
2282
2283                 assert(ret == KERN_SUCCESS);
2284                 return ctx.highest_thread;
2285         }
2286
2287         return THREAD_NULL;
2288 }
2289
2290
2291 struct select_thread_ctx {
2292         thread_t      thread;
2293         event64_t     event;
2294         spl_t        *spl;
2295 };
2296
2297 /**
2298  * link walk callback invoked once for each set to which a waitq belongs
2299  *
2300  * Conditions:
2301  *      initial waitq is locked
2302  *      ctx->thread is unlocked
2303  *
2304  * Notes:
2305  *      This may disable interrupts and early-out of the full DAG link walk by
2306  *      returning KERN_ALREADY_IN_SET. In this case, the returned thread has
2307  *      been removed from the waitq, it's waitq state has been reset, and the
2308  *      caller is responsible to call splx() with the returned interrupt state
2309  *      in ctx->spl.
2310  */
2311 static int waitq_select_thread_cb(struct waitq *waitq, void *ctx,
2312                                   struct waitq_link *link)
2313 {
2314         struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;
2315         struct waitq_set *wqset;
2316         struct waitq *wqsetq;
2317         struct waitq *safeq;
2318         spl_t s;
2319
2320         (void)waitq;
2321
2322         thread_t thread = stctx->thread;
2323         event64_t event = stctx->event;
2324
2325         if (wql_type(link) != WQL_WQS)
2326                 return WQ_ITERATE_CONTINUE;
2327
2328         wqset = link->wql_wqs.wql_set;
2329         wqsetq = &wqset->wqset_q;
2330
2331         assert(!waitq_irq_safe(waitq));
2332         assert(!waitq_irq_safe(wqsetq));
2333
2334         waitq_set_lock(wqset);
2335
2336         s = splsched();
2337
2338         /* find and lock the interrupt-safe waitq the thread is thought to be on */
2339         safeq = global_eventq(wqsetq);
2340         waitq_lock(safeq);
2341
2342         thread_lock(thread);
2343
2344         if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {
2345                 remqueue(&thread->wait_links);
2346                 if (queue_empty(&safeq->waitq_queue)) {
2347                         safeq->waitq_eventmask = 0;
2348                 }
2349                 thread_clear_waitq_state(thread);
2350                 waitq_unlock(safeq);
2351                 waitq_set_unlock(wqset);
2352                 /*
2353                  * thread still locked,
2354                  * return non-zero to break out of WQS walk
2355                  */
2356                 *(stctx->spl) = s;
2357                 return WQ_ITERATE_FOUND;
2358         }
2359
2360         thread_unlock(thread);
2361         waitq_set_unlock(wqset);
2362         waitq_unlock(safeq);
2363         splx(s);
2364
2365         return WQ_ITERATE_CONTINUE;
2366 }
2367
2368 /**
2369  * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting
2370  * on 'waitq' (or any set to which waitq belongs) for 'event'
2371  *
2372  * Conditions:
2373  *      'waitq' is locked
2374  *      'thread' is unlocked
2375  */
2376 static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
2377                                                 event64_t event,
2378                                                 thread_t thread, spl_t *spl)
2379 {
2380         struct waitq *safeq;
2381         struct waitq_link *link;
2382         struct select_thread_ctx ctx;
2383         kern_return_t kr;
2384         spl_t s;
2385
2386         s = splsched();
2387
2388         /* Find and lock the interrupts disabled queue the thread is actually on */
2389         if (!waitq_irq_safe(waitq)) {
2390                 safeq = global_eventq(waitq);
2391                 waitq_lock(safeq);
2392         } else {
2393                 safeq = waitq;
2394         }
2395
2396         thread_lock(thread);
2397
2398         if ((thread->waitq == waitq) && (thread->wait_event == event)) {
2399                 remqueue(&thread->wait_links);
2400                 if (queue_empty(&safeq->waitq_queue)) {
2401                         safeq->waitq_eventmask = 0;
2402                 }
2403                 thread_clear_waitq_state(thread);
2404                 *spl = s;
2405                 /* thread still locked */
2406                 return KERN_SUCCESS;
2407         }
2408
2409         thread_unlock(thread);
2410
2411         if (safeq != waitq)
2412                 waitq_unlock(safeq);
2413
2414         splx(s);
2415
2416         if (!waitq->waitq_set_id)
2417                 return KERN_NOT_WAITING;
2418
2419         /* check to see if the set ID for this wait queue is valid */
2420         link = wql_get_link(waitq->waitq_set_id);
2421         if (!link) {
2422                 /* the waitq to which this set belonged, has been invalidated */
2423                 waitq->waitq_set_id = 0;
2424                 return KERN_NOT_WAITING;
2425         }
2426
2427         /*
2428          * The thread may be waiting on a wait queue set to which
2429          * the input 'waitq' belongs. Go look for the thread in
2430          * all wait queue sets. If it's there, we'll remove it
2431          * because it's equivalent to waiting directly on the input waitq.
2432          */
2433         ctx.thread = thread;
2434         ctx.event = event;
2435         ctx.spl = spl;
2436         kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,
2437                               WQL_WQS, (void *)&ctx, waitq_select_thread_cb);
2438
2439         wql_put_link(link);
2440
2441         /* we found a thread, return success */
2442         if (kr == WQ_ITERATE_FOUND)
2443                 return KERN_SUCCESS;
2444
2445         return KERN_NOT_WAITING;
2446 }
2447
2448 static int prepost_exists_cb(struct waitq_set __unused *wqset,
2449                              void __unused *ctx,
2450                              struct wq_prepost __unused *wqp,
2451                              struct waitq __unused *waitq)
2452 {
2453         /* if we get here, then we know that there is a valid prepost object! */
2454         return WQ_ITERATE_FOUND;
2455 }
2456
2457 /**
2458  * declare a thread's intent to wait on 'waitq' for 'wait_event'
2459  *
2460  * Conditions:
2461  *      'waitq' is locked
2462  */
2463 wait_result_t waitq_assert_wait64_locked(struct waitq *waitq,
2464                                           event64_t wait_event,
2465                                           wait_interrupt_t interruptible,
2466                                           wait_timeout_urgency_t urgency,
2467                                           uint64_t deadline,
2468                                           uint64_t leeway,
2469                                           thread_t thread)
2470 {
2471         wait_result_t wait_result;
2472         int realtime = 0;
2473         struct waitq *safeq;
2474         uintptr_t eventmask;
2475         spl_t s;
2476
2477
2478         /*
2479          * Warning: Do _not_ place debugging print statements here.
2480          *          The waitq is locked!
2481          */
2482         assert(!thread->started || thread == current_thread());
2483
2484         if (thread->waitq != NULL)
2485                 panic("thread already waiting on %p", thread->waitq);
2486
2487         if (waitq_is_set(waitq)) {
2488                 struct waitq_set *wqset = (struct waitq_set *)waitq;
2489                 /*
2490                  * early-out if the thread is waiting on a wait queue set
2491                  * that has already been pre-posted.
2492                  */
2493                 if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {
2494                         int ret;
2495                         /*
2496                          * Run through the list of potential preposts. Because
2497                          * this is a hot path, we short-circuit the iteration
2498                          * if we find just one prepost object.
2499                          */
2500                         ret = wq_prepost_foreach_locked(wqset, NULL,
2501                                                         prepost_exists_cb);
2502                         if (ret == WQ_ITERATE_FOUND) {
2503                                 s = splsched();
2504                                 thread_lock(thread);
2505                                 thread->wait_result = THREAD_AWAKENED;
2506                                 thread_unlock(thread);
2507                                 splx(s);
2508                                 return THREAD_AWAKENED;
2509                         }
2510                 }
2511         }
2512
2513         s = splsched();
2514
2515         /*
2516          * If already dealing with an irq safe wait queue, we are all set.
2517          * Otherwise, determine a global queue to use and lock it.
2518          */
2519         if (!waitq_irq_safe(waitq)) {
2520                 safeq = global_eventq(waitq);
2521                 eventmask = _CAST_TO_EVENT_MASK(waitq);
2522                 waitq_lock(safeq);
2523         } else {
2524                 safeq = waitq;
2525                 eventmask = _CAST_TO_EVENT_MASK(wait_event);
2526         }
2527
2528         /* lock the thread now that we have the irq-safe waitq locked */
2529         thread_lock(thread);
2530
2531         /*
2532          * Realtime threads get priority for wait queue placements.
2533          * This allows wait_queue_wakeup_one to prefer a waiting
2534          * realtime thread, similar in principle to performing
2535          * a wait_queue_wakeup_all and allowing scheduler prioritization
2536          * to run the realtime thread, but without causing the
2537          * lock contention of that scenario.
2538          */
2539         if (thread->sched_pri >= BASEPRI_REALTIME)
2540                 realtime = 1;
2541
2542         /*
2543          * This is the extent to which we currently take scheduling attributes
2544          * into account.  If the thread is vm priviledged, we stick it at
2545          * the front of the queue.  Later, these queues will honor the policy
2546          * value set at waitq_init time.
2547          */
2548         wait_result = thread_mark_wait_locked(thread, interruptible);
2549         /* thread->wait_result has been set */
2550         if (wait_result == THREAD_WAITING) {
2551
2552                 if (!safeq->waitq_fifo
2553                     || (thread->options & TH_OPT_VMPRIV) || realtime)
2554                         enqueue_head(&safeq->waitq_queue, &thread->wait_links);
2555                 else
2556                         enqueue_tail(&safeq->waitq_queue, &thread->wait_links);
2557
2558                 /* mark the event and real waitq, even if enqueued on a global safeq */
2559                 thread->wait_event = wait_event;
2560                 thread->waitq = waitq;
2561
2562                 if (deadline != 0) {
2563                         boolean_t act;
2564
2565                         act = timer_call_enter_with_leeway(&thread->wait_timer,
2566                                                            NULL,
2567                                                            deadline, leeway,
2568                                                            urgency, FALSE);
2569                         if (!act)
2570                                 thread->wait_timer_active++;
2571                         thread->wait_timer_is_set = TRUE;
2572                 }
2573
2574                 if (waitq_is_global(safeq))
2575                         safeq->waitq_eventmask |= eventmask;
2576
2577                 waitq_stats_count_wait(waitq);
2578         }
2579
2580         /* unlock the thread */
2581         thread_unlock(thread);
2582
2583         /* unlock the safeq if we locked it here */
2584         if (safeq != waitq) {
2585                 waitq_unlock(safeq);
2586         }
2587
2588         splx(s);
2589
2590         return wait_result;
2591 }
2592
2593 /**
2594  * remove 'thread' from its current blocking state on 'waitq'
2595  *
2596  * Conditions:
2597  *      'thread' is locked
2598  *
2599  * Notes:
2600  *      This function is primarily used by clear_wait_internal in
2601  *      sched_prim.c from the thread timer wakeup path
2602  *      (i.e. the thread was waiting on 'waitq' with a timeout that expired)
2603  */
2604 int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
2605 {
2606         struct waitq *safeq;
2607
2608         assert_thread_magic(thread);
2609         assert(thread->waitq == waitq);
2610
2611         /* Find the interrupts disabled queue thread is waiting on */
2612         if (!waitq_irq_safe(waitq)) {
2613                 safeq = global_eventq(waitq);
2614         } else {
2615                 safeq = waitq;
2616         }
2617
2618         /* thread is already locked so have to try for the waitq lock */
2619         if (!waitq_lock_try(safeq))
2620                 return 0;
2621
2622         remqueue(&thread->wait_links);
2623         thread_clear_waitq_state(thread);
2624         waitq_stats_count_clear_wakeup(waitq);
2625
2626         /* clear the global event mask if this was the last thread there! */
2627         if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue)) {
2628                 safeq->waitq_eventmask = 0;
2629                 /* JMM - also mark no-waiters on waitq (if not the same as the safeq) */
2630         }
2631
2632         waitq_unlock(safeq);
2633
2634         return 1;
2635 }
2636
2637
2638 static __inline__
2639 void maybe_adjust_thread_pri(thread_t thread, int priority) {
2640         if (thread->sched_pri < priority) {
2641                 if (priority <= MAXPRI) {
2642                         set_sched_pri(thread, priority);
2643
2644                         thread->was_promoted_on_wakeup = 1;
2645                         thread->sched_flags |= TH_SFLAG_PROMOTED;
2646                 }
2647                 return;
2648         }
2649
2650         /*
2651          * If the caller is requesting the waitq subsystem to promote the
2652          * priority of the awoken thread, then boost the thread's priority to
2653          * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
2654          * higher priority).  This boost must be removed via a call to
2655          * waitq_clear_promotion_locked.
2656          */
2657         if (priority == WAITQ_PROMOTE_PRIORITY &&
2658             (thread->sched_pri < WAITQ_BOOST_PRIORITY ||
2659              !(thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED))) {
2660
2661                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE) | DBG_FUNC_NONE,
2662                                       (uintptr_t)thread_tid(thread),
2663                                       thread->sched_pri, thread->base_pri,
2664                                       WAITQ_BOOST_PRIORITY, 0);
2665                 thread->sched_flags |= TH_SFLAG_WAITQ_PROMOTED;
2666                 if (thread->sched_pri < WAITQ_BOOST_PRIORITY)
2667                         set_sched_pri(thread, WAITQ_BOOST_PRIORITY);
2668         }
2669 }
2670
2671 /**
2672  * Clear a thread's waitq priority promotion state and the waitq's boost flag
2673  *
2674  * This function will always clear the waitq's 'waitq_boost' flag. If the
2675  * 'thread' parameter is non-null, the this function will also check the
2676  * priority promotion (boost) state of that thread. If this thread was boosted
2677  * (by having been awoken from a boosting waitq), then this boost state is
2678  * cleared. This function is to be paired with waitq_enable_promote_locked.
2679  */
2680 void waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
2681 {
2682         spl_t s;
2683
2684         assert(waitq_held(waitq));
2685         if (thread == THREAD_NULL)
2686                 return;
2687
2688         if (!waitq_irq_safe(waitq))
2689                 s = splsched();
2690         thread_lock(thread);
2691
2692         if (thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) {
2693                 thread->sched_flags &= ~TH_SFLAG_WAITQ_PROMOTED;
2694
2695                 if (thread->sched_flags & TH_SFLAG_PROMOTED_MASK) {
2696                         /* it still has other promotions (mutex/rw_lock) */
2697                 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
2698                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE,
2699                                               (uintptr_t)thread_tid(thread),
2700                                               thread->sched_pri,
2701                                               thread->base_pri,
2702                                               DEPRESSPRI, 0);
2703                         set_sched_pri(thread, DEPRESSPRI);
2704                 } else {
2705                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE,
2706                                               (uintptr_t)thread_tid(thread),
2707                                               thread->sched_pri,
2708                                               thread->base_pri,
2709                                               thread->base_pri, 0);
2710                         thread_recompute_sched_pri(thread, FALSE);
2711                 }
2712         }
2713
2714         thread_unlock(thread);
2715         if (!waitq_irq_safe(waitq))
2716                 splx(s);
2717 }
2718
2719 /**
2720  * wakeup all threads waiting on 'waitq' for 'wake_event'
2721  *
2722  * Conditions:
2723  *      'waitq' is locked
2724  *
2725  * Notes:
2726  *      May temporarily disable and re-enable interrupts
2727  *      and re-adjust thread priority of each awoken thread.
2728  *
2729  *      If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have
2730  *      been unlocked before calling thread_go() on any returned threads, and
2731  *      is guaranteed to be unlocked upon function return.
2732  */
2733 kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq,
2734                                         event64_t wake_event,
2735                                         wait_result_t result,
2736                                         uint64_t *reserved_preposts,
2737                                         int priority,
2738                                         waitq_lock_state_t lock_state)
2739 {
2740         kern_return_t ret;
2741         thread_t thread;
2742         spl_t th_spl;
2743         int nthreads;
2744         queue_head_t wakeup_queue;
2745
2746         assert(waitq_held(waitq));
2747         queue_init(&wakeup_queue);
2748
2749         nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
2750                                          reserved_preposts,
2751                                          &wakeup_queue, -1, &th_spl);
2752
2753         /* set each thread running */
2754         ret = KERN_NOT_WAITING;
2755
2756 #if CONFIG_WAITQ_STATS
2757         qe_foreach_element(thread, &wakeup_queue, wait_links)
2758                 waitq_stats_count_wakeup(waitq);
2759 #endif
2760         if (lock_state == WAITQ_UNLOCK)
2761                 waitq_unlock(waitq);
2762
2763         qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {
2764                 assert_thread_magic(thread);
2765                 remqueue(&thread->wait_links);
2766                 maybe_adjust_thread_pri(thread, priority);
2767                 ret = thread_go(thread, result);
2768                 assert(ret == KERN_SUCCESS);
2769                 thread_unlock(thread);
2770         }
2771         if (nthreads > 0)
2772                 splx(th_spl);
2773         else
2774                 waitq_stats_count_fail(waitq);
2775
2776         return ret;
2777 }
2778
2779 /**
2780  * wakeup one thread waiting on 'waitq' for 'wake_event'
2781  *
2782  * Conditions:
2783  *      'waitq' is locked
2784  *
2785  * Notes:
2786  *      May temporarily disable and re-enable interrupts.
2787  */
2788 kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq,
2789                                         event64_t wake_event,
2790                                         wait_result_t result,
2791                                         uint64_t *reserved_preposts,
2792                                         int priority,
2793                                         waitq_lock_state_t lock_state)
2794 {
2795         thread_t thread;
2796         spl_t th_spl;
2797
2798         assert(waitq_held(waitq));
2799
2800         if (priority == WAITQ_SELECT_MAX_PRI) {
2801                 thread = waitq_select_max_locked(waitq, wake_event,
2802                                                  reserved_preposts,
2803                                                  &th_spl);
2804         } else {
2805                 thread = waitq_select_one_locked(waitq, wake_event,
2806                                                  reserved_preposts,
2807                                                  priority, &th_spl);
2808         }
2809
2810
2811         if (thread != THREAD_NULL)
2812                 waitq_stats_count_wakeup(waitq);
2813         else
2814                 waitq_stats_count_fail(waitq);
2815
2816         if (lock_state == WAITQ_UNLOCK)
2817                 waitq_unlock(waitq);
2818
2819         if (thread != THREAD_NULL) {
2820                 maybe_adjust_thread_pri(thread, priority);
2821                 kern_return_t ret = thread_go(thread, result);
2822                 assert(ret == KERN_SUCCESS);
2823                 thread_unlock(thread);
2824                 splx(th_spl);
2825                 return ret;
2826         }
2827
2828         return KERN_NOT_WAITING;
2829 }
2830
2831 /**
2832  * wakeup one thread waiting on 'waitq' for 'wake_event'
2833  *
2834  * Conditions:
2835  *      'waitq' is locked
2836  *
2837  * Returns:
2838  *      A locked, runnable thread.
2839  *      If return value is non-NULL, interrupts have also
2840  *      been disabled, and the caller is responsible to call
2841  *      splx() with the returned '*spl' value.
2842  */
2843 thread_t
2844 waitq_wakeup64_identify_locked(struct waitq     *waitq,
2845                                event64_t        wake_event,
2846                                wait_result_t    result,
2847                                spl_t            *spl,
2848                                uint64_t         *reserved_preposts,
2849                                int              priority,
2850                                waitq_lock_state_t lock_state)
2851 {
2852         thread_t thread;
2853
2854         assert(waitq_held(waitq));
2855
2856         if (priority == WAITQ_SELECT_MAX_PRI) {
2857                 thread = waitq_select_max_locked(waitq, wake_event,
2858                                                  reserved_preposts,
2859                                                  spl);
2860         } else {
2861                 thread = waitq_select_one_locked(waitq, wake_event,
2862                                                  reserved_preposts,
2863                                                  priority, spl);
2864         }
2865
2866         if (thread != THREAD_NULL)
2867                 waitq_stats_count_wakeup(waitq);
2868         else
2869                 waitq_stats_count_fail(waitq);
2870
2871         if (lock_state == WAITQ_UNLOCK)
2872                 waitq_unlock(waitq);
2873
2874         if (thread != THREAD_NULL) {
2875                 kern_return_t __assert_only ret;
2876                 ret = thread_go(thread, result);
2877                 assert(ret == KERN_SUCCESS);
2878         }
2879
2880         return thread; /* locked if not NULL (caller responsible for spl) */
2881 }
2882
2883 /**
2884  * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
2885  *
2886  * Conditions:
2887  *      'waitq' is locked
2888  *      'thread' is unlocked
2889  *
2890  * Notes:
2891  *      May temporarily disable and re-enable interrupts
2892  *
2893  *      If the input lock_state == WAITQ_UNLOCK then the waitq will have been
2894  *      unlocked before calling thread_go() if 'thread' is to be awoken, and
2895  *      is guaranteed to be unlocked upon function return.
2896  */
2897 kern_return_t waitq_wakeup64_thread_locked(struct waitq *waitq,
2898                                            event64_t wake_event,
2899                                            thread_t thread,
2900                                            wait_result_t result,
2901                                            waitq_lock_state_t lock_state)
2902 {
2903         kern_return_t ret;
2904         spl_t th_spl;
2905
2906         assert(waitq_held(waitq));
2907         assert_thread_magic(thread);
2908
2909         /*
2910          * See if the thread was still waiting there.  If so, it got
2911          * dequeued and returned locked.
2912          */
2913         ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
2914
2915         if (ret == KERN_SUCCESS)
2916                 waitq_stats_count_wakeup(waitq);
2917         else
2918                 waitq_stats_count_fail(waitq);
2919
2920         if (lock_state == WAITQ_UNLOCK)
2921                 waitq_unlock(waitq);
2922
2923         if (ret != KERN_SUCCESS)
2924                 return KERN_NOT_WAITING;
2925
2926         ret = thread_go(thread, result);
2927         assert(ret == KERN_SUCCESS);
2928         thread_unlock(thread);
2929         splx(th_spl);
2930
2931         return ret;
2932 }
2933
2934
2935
2936 /* ----------------------------------------------------------------------
2937  *
2938  * In-Kernel API
2939  *
2940  * ---------------------------------------------------------------------- */
2941
2942 /**
2943  * initialize a waitq object
2944  */
2945 kern_return_t waitq_init(struct waitq *waitq, int policy)
2946 {
2947         assert(waitq != NULL);
2948
2949         /* only FIFO and LIFO for now */
2950         if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0)
2951                 return KERN_INVALID_ARGUMENT;
2952
2953         waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
2954         waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
2955         waitq->waitq_prepost = 0;
2956         waitq->waitq_type = WQT_QUEUE;
2957         waitq->waitq_eventmask = 0;
2958
2959         waitq->waitq_set_id = 0;
2960         waitq->waitq_prepost_id = 0;
2961
2962         waitq_lock_init(waitq);
2963         queue_init(&waitq->waitq_queue);
2964
2965         waitq->waitq_isvalid = 1;
2966         return KERN_SUCCESS;
2967 }
2968
/*
 * Context handed to the prepost iteration callback when unlinking a waitq
 * from a set (filled in by waitq_unlink_all_cb).
 */
struct wq_unlink_ctx {
        struct waitq *unlink_wq;        /* the waitq being unlinked */
        struct waitq_set *unlink_wqset; /* the set whose preposts are walked */
};

/* prepost iteration callback used by waitq_unlink_all_cb; defined elsewhere in this file */
static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
                                   struct wq_prepost *wqp, struct waitq *waitq);
2976
2977 /**
2978  * walk_waitq_links callback to invalidate 'link' parameter
2979  *
2980  * Conditions:
2981  *      Called from walk_waitq_links.
2982  *      Note that unlink other callbacks, this one make no assumptions about
2983  *      the 'waitq' parameter, specifically it does not have to be locked or
2984  *      even valid.
2985  */
2986 static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx,
2987                                struct waitq_link *link)
2988 {
2989         (void)waitq;
2990         (void)ctx;
2991         if (wql_type(link) == WQL_LINK && wql_is_valid(link))
2992                 wql_invalidate(link);
2993
2994         if (wql_type(link) == WQL_WQS) {
2995                 struct waitq_set *wqset;
2996                 struct wq_unlink_ctx ulctx;
2997
2998                 /*
2999                  * When destroying the waitq, take the time to clear out any
3000                  * preposts it may have made. This could potentially save time
3001                  * on the IPC send path which would otherwise have to iterate
3002                  * over lots of dead port preposts.
3003                  */
3004                 if (waitq->waitq_prepost_id == 0)
3005                         goto out;
3006
3007                 wqset = link->wql_wqs.wql_set;
3008                 assert(wqset != NULL);
3009                 assert(!waitq_irq_safe(&wqset->wqset_q));
3010
3011                 waitq_set_lock(wqset);
3012
3013                 if (!waitq_set_is_valid(wqset)) {
3014                         /* someone raced us to teardown */
3015                         goto out_unlock;
3016                 }
3017                 if (!waitq_set_maybe_preposted(wqset))
3018                         goto out_unlock;
3019
3020                 ulctx.unlink_wq = waitq;
3021                 ulctx.unlink_wqset = wqset;
3022                 (void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,
3023                                          waitq_unlink_prepost_cb);
3024 out_unlock:
3025                 waitq_set_unlock(wqset);
3026         }
3027
3028 out:
3029         return WQ_ITERATE_CONTINUE;
3030 }
3031
3032
3033 /**
3034  * cleanup any link/prepost table resources associated with a waitq
3035  */
3036 void waitq_deinit(struct waitq *waitq)
3037 {
3038         spl_t s;
3039
3040         if (!waitq || !waitq_is_queue(waitq))
3041                 return;
3042
3043         if (waitq_irq_safe(waitq))
3044                 s = splsched();
3045         waitq_lock(waitq);
3046         if (!waitq_valid(waitq)) {
3047                 waitq_unlock(waitq);
3048                 if (waitq_irq_safe(waitq))
3049                         splx(s);
3050                 return;
3051         }
3052
3053         waitq->waitq_type = WQT_INVALID;
3054         waitq->waitq_isvalid = 0;
3055
3056         if (!waitq_irq_safe(waitq)) {
3057                 waitq_unlink_all_unlock(waitq);
3058                 /* waitq unlocked and set links deallocated */
3059         } else {
3060                 waitq_unlock(waitq);
3061                 splx(s);
3062         }
3063
3064         assert(queue_empty(&waitq->waitq_queue));
3065 }
3066
3067 void waitq_invalidate_locked(struct waitq *waitq)
3068 {
3069         assert(waitq_held(waitq));
3070         assert(waitq_is_valid(waitq));
3071         waitq->waitq_isvalid = 0;
3072 }
3073
3074 /**
3075  * invalidate the given wq_prepost object
3076  *
3077  * Conditions:
3078  *      Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)
3079  */
3080 static int wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
3081                                         void __unused *ctx,
3082                                         struct wq_prepost *wqp,
3083                                         struct waitq __unused *waitq)
3084 {
3085         if (wqp_type(wqp) == WQP_POST)
3086                 wq_prepost_invalidate(wqp);
3087         return WQ_ITERATE_CONTINUE;
3088 }
3089
3090
3091 /**
3092  * allocate and initialize a waitq set object
3093  *
3094  * Conditions:
3095  *      may block
3096  *
3097  * Returns:
3098  *      allocated / initialized waitq_set object
3099  *      NULL on failure
3100  */
3101 struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook)
3102 {
3103         struct waitq_set *wqset;
3104
3105         wqset = (struct waitq_set *)zalloc(waitq_set_zone);
3106         if (!wqset)
3107                 panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);
3108
3109         kern_return_t ret;
3110         ret = waitq_set_init(wqset, policy, NULL, prepost_hook);
3111         if (ret != KERN_SUCCESS) {
3112                 zfree(waitq_set_zone, wqset);
3113                 wqset = NULL;
3114         }
3115
3116         return wqset;
3117 }
3118
3119 /**
3120  * initialize a waitq set object
3121  *
3122  * Conditions:
3123  *      may (rarely) block if link table needs to grow, and
3124  *      no 'reserved_link' object is passed.
3125  */
3126 kern_return_t waitq_set_init(struct waitq_set *wqset,
3127                              int policy, uint64_t *reserved_link,
3128                              void *prepost_hook)
3129 {
3130         struct waitq_link *link;
3131         kern_return_t ret;
3132
3133         memset(wqset, 0, sizeof(*wqset));
3134
3135         ret = waitq_init(&wqset->wqset_q, policy);
3136         if (ret != KERN_SUCCESS)
3137                 return ret;
3138
3139         wqset->wqset_q.waitq_type = WQT_SET;
3140         if (policy & SYNC_POLICY_PREPOST) {
3141                 wqset->wqset_q.waitq_prepost = 1;
3142                 wqset->wqset_prepost_id = 0;
3143                 assert(prepost_hook == NULL);
3144         } else {
3145                 wqset->wqset_q.waitq_prepost = 0;
3146                 wqset->wqset_prepost_hook = prepost_hook;
3147         }
3148
3149         if (reserved_link && *reserved_link != 0) {
3150                 link = wql_get_reserved(*reserved_link, WQL_WQS);
3151                 /* always consume the caller's reference */
3152                 *reserved_link = 0;
3153         } else {
3154                 link = wql_alloc_link(WQL_WQS);
3155         }
3156         if (!link)
3157                 panic("Can't allocate link object for waitq set: %p", wqset);
3158
3159         link->wql_wqs.wql_set = wqset;
3160         wql_mkvalid(link);
3161
3162         wqset->wqset_id = link->wql_setid.id;
3163         wql_put_link(link);
3164
3165         return KERN_SUCCESS;
3166 }
3167
3168 /**
3169  * clear out / release any resources associated with a waitq set
3170  *
3171  * Conditions:
3172  *      may block
3173  * Note:
3174  *      This will render the waitq set invalid, and it must
3175  *      be re-initialized with waitq_set_init before it can be used again
3176  */
3177 void waitq_set_deinit(struct waitq_set *wqset)
3178 {
3179         struct waitq_link *link = NULL;
3180         uint64_t set_id, prepost_id;
3181
3182         if (!waitqs_is_set(wqset))
3183                 panic("trying to de-initialize an invalid wqset @%p", wqset);
3184
3185         assert(!waitq_irq_safe(&wqset->wqset_q));
3186         waitq_set_lock(wqset);
3187
3188         set_id = wqset->wqset_id;
3189
3190         /* grab the set's link object */
3191         link = wql_get_link(set_id);
3192         if (link)
3193                 wql_invalidate(link);
3194
3195         /* someone raced us to deinit */
3196         if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
3197                 if (link)
3198                         wql_put_link(link);
3199                 waitq_set_unlock(wqset);
3200                 return;
3201         }
3202
3203         /* every wait queue set should have a valid link object */
3204         assert(link != NULL && wql_type(link) == WQL_WQS);
3205
3206         wqset->wqset_id = 0;
3207
3208         /*
3209          * This set may have a lot of preposts, or may have been a member of
3210          * many other sets. To minimize spinlock hold times, we clear out the
3211          * waitq set data structure under the lock-hold, but don't clear any
3212          * table objects. We keep handles to the prepost and set linkage
3213          * objects and free those outside the critical section.
3214          */
3215         prepost_id = 0;
3216         if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id)
3217                 prepost_id = wqset->wqset_prepost_id;
3218         /* else { TODO: notify kqueue subsystem? } */
3219         wqset->wqset_prepost_id = 0;
3220
3221         wqset->wqset_q.waitq_type = WQT_INVALID;
3222         wqset->wqset_q.waitq_fifo = 0;
3223         wqset->wqset_q.waitq_prepost = 0;
3224         wqset->wqset_q.waitq_isvalid = 0;
3225
3226         /* don't clear the 'waitq_irq' bit: it's used in locking! */
3227         wqset->wqset_q.waitq_eventmask = 0;
3228
3229         waitq_unlink_all_unlock(&wqset->wqset_q);
3230         /* wqset->wqset_q unlocked and set links deallocated */
3231
3232         /*
3233          * walk_waitq_links may race with us for access to the waitq set.
3234          * If walk_waitq_links has a reference to the set, then we should wait
3235          * until the link's refcount goes to 1 (our reference) before we exit
3236          * this function. That way we ensure that the waitq set memory will
3237          * remain valid even though it's been cleared out.
3238          */
3239         while (wql_refcnt(link) > 1)
3240                 delay(1);
3241         wql_put_link(link);
3242
3243         /* drop / unlink all the prepost table objects */
3244         /* JMM - can this happen before the delay? */
3245         if (prepost_id)
3246                 (void)wq_prepost_iterate(prepost_id, NULL,
3247                                          wqset_clear_prepost_chain_cb);
3248 }
3249
3250 /**
3251  * de-initialize and free an allocated waitq set object
3252  *
3253  * Conditions:
3254  *      may block
3255  */
3256 kern_return_t waitq_set_free(struct waitq_set *wqset)
3257 {
3258         waitq_set_deinit(wqset);
3259
3260         memset(wqset, 0, sizeof(*wqset));
3261         zfree(waitq_set_zone, wqset);
3262
3263         return KERN_SUCCESS;
3264 }
3265
3266 #if DEVELOPMENT || DEBUG
3267 #if CONFIG_WAITQ_DEBUG
3268 /**
3269  * return the set ID of 'wqset'
3270  */
3271 uint64_t wqset_id(struct waitq_set *wqset)
3272 {
3273         if (!wqset)
3274                 return 0;
3275
3276         assert(waitqs_is_set(wqset));
3277         return wqset->wqset_id;
3278 }
3279
3280 /**
3281  * returns a pointer to the waitq object embedded in 'wqset'
3282  */
3283 struct waitq *wqset_waitq(struct waitq_set *wqset)
3284 {
3285         if (!wqset)
3286                 return NULL;
3287
3288         assert(waitqs_is_set(wqset));
3289
3290         return &wqset->wqset_q;
3291 }
3292 #endif /* CONFIG_WAITQ_DEBUG */
3293 #endif /* DEVELOPMENT || DEBUG */
3294
3295
3296 /**
3297  * clear all preposts originating from 'waitq'
3298  *
3299  * Conditions:
3300  *      'waitq' locked
3301  *      may (rarely) spin waiting for another on-core thread to
3302  *      release the last reference to the waitq's prepost link object
3303  *
3304  * NOTE:
3305  *      If this function needs to spin, it will drop the waitq lock!
3306  *      The return value of the function indicates whether or not this
3307  *      happened: 1 == lock was dropped, 0 == lock held
3308  */
3309 int waitq_clear_prepost_locked(struct waitq *waitq)
3310 {
3311         struct wq_prepost *wqp;
3312         int dropped_lock = 0;
3313
3314         assert(!waitq_irq_safe(waitq));
3315
3316         if (waitq->waitq_prepost_id == 0)
3317                 return 0;
3318
3319         wqp = wq_prepost_get(waitq->waitq_prepost_id);
3320         waitq->waitq_prepost_id = 0;
3321         if (wqp) {
3322                 uint64_t wqp_id = wqp->wqp_prepostid.id;
3323                 wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",
3324                         wqp->wqp_prepostid.id, wqp_refcnt(wqp));
3325                 wq_prepost_invalidate(wqp);
3326                 while (wqp_refcnt(wqp) > 1) {
3327
3328                         /*
3329                          * Some other thread must have raced us to grab a link
3330                          * object reference before we invalidated it. This
3331                          * means that they are probably trying to access the
3332                          * waitq to which the prepost object points. We need
3333                          * to wait here until the other thread drops their
3334                          * reference. We know that no one else can get a
3335                          * reference (the object has been invalidated), and
3336                          * that prepost references are short-lived (dropped on
3337                          * a call to wq_prepost_put). We also know that no one
3338                          * blocks while holding a reference therefore the
3339                          * other reference holder must be on-core. We'll just
3340                          * sit and wait for the other reference to be dropped.
3341                          */
3342                         disable_preemption();
3343
3344                         waitq_unlock(waitq);
3345                         dropped_lock = 1;
3346                         /*
3347                          * don't yield here, just spin and assume the other
3348                          * consumer is already on core...
3349                          */
3350                         delay(1);
3351
3352                         waitq_lock(waitq);
3353
3354                         enable_preemption();
3355                 }
3356                 if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id)
3357                         wq_prepost_put(wqp);
3358         }
3359
3360         return dropped_lock;
3361 }
3362
/**
 * drop every prepost that originated from 'waitq'
 *
 * Conditions:
 *	'waitq' is not locked
 *	may disable and re-enable interrupts
 */
void waitq_clear_prepost(struct waitq *waitq)
{
	int lock_dropped;

	assert(waitq_valid(waitq));
	assert(!waitq_irq_safe(waitq));

	waitq_lock(waitq);

	/* whether the lock was dropped in the middle is irrelevant to us */
	lock_dropped = waitq_clear_prepost_locked(waitq);
	(void)lock_dropped;

	waitq_unlock(waitq);
}
3380
3381 /**
3382  * return a the waitq's prepost object ID (allocate if necessary)
3383  *
3384  * Conditions:
3385  *      'waitq' is unlocked
3386  */
3387 uint64_t waitq_get_prepost_id(struct waitq *waitq)
3388 {
3389         struct wq_prepost *wqp;
3390         uint64_t wqp_id = 0;
3391
3392         if (!waitq_valid(waitq))
3393                 return 0;
3394
3395         assert(!waitq_irq_safe(waitq));
3396
3397         waitq_lock(waitq);
3398
3399         if (!waitq_valid(waitq))
3400                 goto out_unlock;
3401
3402         if (waitq->waitq_prepost_id) {
3403                 wqp_id = waitq->waitq_prepost_id;
3404                 goto out_unlock;
3405         }
3406
3407         /* don't hold a spinlock while allocating a prepost object */
3408         waitq_unlock(waitq);
3409
3410         wqp = wq_prepost_alloc(WQP_WQ, 1);
3411         if (!wqp)
3412                 return 0;
3413
3414         /* re-acquire the waitq lock */
3415         waitq_lock(waitq);
3416
3417         if (!waitq_valid(waitq)) {
3418                 wq_prepost_put(wqp);
3419                 wqp_id = 0;
3420                 goto out_unlock;
3421         }
3422
3423         if (waitq->waitq_prepost_id) {
3424                 /* we were beat by someone else */
3425                 wq_prepost_put(wqp);
3426                 wqp_id = waitq->waitq_prepost_id;
3427                 goto out_unlock;
3428         }
3429
3430         wqp->wqp_wq.wqp_wq_ptr = waitq;
3431
3432         wqp_set_valid(wqp);
3433         wqp_id = wqp->wqp_prepostid.id;
3434         waitq->waitq_prepost_id = wqp_id;
3435
3436         wq_prepost_put(wqp);
3437
3438 out_unlock:
3439         waitq_unlock(waitq);
3440
3441         return wqp_id;
3442 }
3443
3444
3445 static int waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)
3446 {
3447         uint64_t setid = *(uint64_t *)ctx;
3448         int wqltype = wql_type(link);
3449         (void)waitq;
3450         if (wqltype == WQL_WQS && link->wql_setid.id == setid) {
3451                 wqdbg_v("  waitq already in set 0x%llx", setid);
3452                 return WQ_ITERATE_FOUND;
3453         } else if (wqltype == WQL_LINK) {
3454                 /*
3455                  * break out early if we see a link that points to the setid
3456                  * in question. This saves us a step in the
3457                  * iteration/recursion
3458                  */
3459                 wqdbg_v("  waitq already in set 0x%llx (WQL_LINK)", setid);
3460                 if (link->wql_link.left_setid == setid ||
3461                     link->wql_link.right_setid == setid)
3462                         return WQ_ITERATE_FOUND;
3463         }
3464
3465         return WQ_ITERATE_CONTINUE;
3466 }
3467
3468 /**
3469  * determine if 'waitq' is a member of 'wqset'
3470  *
3471  * Conditions:
3472  *      neither 'waitq' nor 'wqset' is not locked
3473  *      may disable and re-enable interrupts while locking 'waitq'
3474  */
3475 boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset)
3476 {
3477         kern_return_t kr = WQ_ITERATE_SUCCESS;
3478         uint64_t setid;
3479
3480         if (!waitq_valid(waitq))
3481                 panic("Invalid waitq: %p", waitq);
3482         assert(!waitq_irq_safe(waitq));
3483
3484         if (!waitqs_is_set(wqset))
3485                 return FALSE;
3486
3487         waitq_lock(waitq);
3488
3489         setid = wqset->wqset_id;
3490         if (!setid)
3491                 goto out_unlock;
3492
3493         /* fast path: most waitqs are members of only 1 set */
3494         if (waitq->waitq_set_id == setid) {
3495                 waitq_unlock(waitq);
3496                 return TRUE;
3497         }
3498
3499         /* walk the link table and look for the Set ID of wqset */
3500         kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
3501                               WQL_ALL, (void *)&setid, waitq_inset_cb);
3502
3503 out_unlock:
3504         waitq_unlock(waitq);
3505         return (kr == WQ_ITERATE_FOUND);
3506 }
3507
3508 /**
3509  * Returns true is the given waitq is a member of at least 1 set
3510  */
3511 boolean_t waitq_in_set(struct waitq *waitq)
3512 {
3513         struct waitq_link *link;
3514         boolean_t inset = FALSE;
3515
3516         if (waitq_irq_safe(waitq))
3517                 return FALSE;
3518
3519         waitq_lock(waitq);
3520
3521         if (!waitq->waitq_set_id)
3522                 goto out_unlock;
3523
3524         link = wql_get_link(waitq->waitq_set_id);
3525         if (link) {
3526                 /* if we get here, the waitq is in _at_least_one_ set */
3527                 inset = TRUE;
3528                 wql_put_link(link);
3529         } else {
3530                 /* we can just optimize this for next time */
3531                 waitq->waitq_set_id = 0;
3532         }
3533
3534 out_unlock:
3535         waitq_unlock(waitq);
3536         return inset;
3537 }
3538
3539
3540 /**
3541  * pre-allocate a waitq link structure from the link table
3542  *
3543  * Conditions:
3544  *      'waitq' is not locked
3545  *      may (rarely) block if link table needs to grow
3546  */
3547 uint64_t waitq_link_reserve(struct waitq *waitq)
3548 {
3549         struct waitq_link *link;
3550         uint64_t reserved_id = 0;
3551
3552         assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
3553
3554         /*
3555          * We've asserted that the caller can block, so we enforce a
3556          * minimum-free table element policy here.
3557          */
3558         wql_ensure_free_space();
3559
3560         (void)waitq;
3561         link = wql_alloc_link(LT_RESERVED);
3562         if (!link)
3563                 return 0;
3564
3565         reserved_id = link->wql_setid.id;
3566
3567         return reserved_id;
3568 }
3569
3570 /**
3571  * release a pre-allocated waitq link structure
3572  */
3573 void waitq_link_release(uint64_t id)
3574 {
3575         struct waitq_link *link;
3576
3577         if (id == 0)
3578                 return;
3579
3580         link = wql_get_reserved(id, WQL_LINK);
3581         if (!link)
3582                 return;
3583
3584         /*
3585          * if we successfully got a link object, then we know
3586          * it's not been marked valid, and can be released with
3587          * a standard wql_put_link() which should free the element.
3588          */
3589         wql_put_link(link);
3590 #if CONFIG_LTABLE_STATS
3591         g_wqlinktable.nreserved_releases += 1;
3592 #endif
3593 }
3594
3595 /**
3596  * link 'waitq' to the set identified by 'setid' using the 'link' structure
3597  *
3598  * Conditions:
3599  *      'waitq' is locked
3600  *      caller should have a reference to the 'link' object
3601  */
3602 static kern_return_t waitq_link_internal(struct waitq *waitq,
3603                                          uint64_t setid, struct waitq_link *link)
3604 {
3605         struct waitq_link *qlink;
3606         kern_return_t kr;
3607
3608         assert(waitq_held(waitq));
3609
3610         /*
3611          * If the waitq_set_id field is empty, then this waitq is not
3612          * a member of any other set. All we have to do is update the
3613          * field.
3614          */
3615         if (!waitq->waitq_set_id) {
3616                 waitq->waitq_set_id = setid;
3617                 return KERN_SUCCESS;
3618         }
3619
3620         qlink = wql_get_link(waitq->waitq_set_id);
3621         if (!qlink) {
3622                 /*
3623                  * The set to which this wait queue belonged has been
3624                  * destroyed / invalidated. We can re-use the waitq field.
3625                  */
3626                 waitq->waitq_set_id = setid;
3627                 return KERN_SUCCESS;
3628         }
3629         wql_put_link(qlink);
3630
3631         /*
3632          * Check to see if it's already a member of the set.
3633          *
3634          * TODO: check for cycles!
3635          */
3636         kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
3637                               WQL_ALL, (void *)&setid, waitq_inset_cb);
3638         if (kr == WQ_ITERATE_FOUND)
3639                 return kr;
3640
3641         /*
3642          * This wait queue is a member of at least one set already,
3643          * and _not_ a member of the given set. Use our previously
3644          * allocated link object, and hook it up to the wait queue.
3645          * Note that it's possible that one or more of the wait queue sets to
3646          * which the wait queue belongs was invalidated before we allocated
3647          * this link object. That's OK because the next time we use that
3648          * object we'll just ignore it.
3649          */
3650         link->wql_link.left_setid = setid;
3651         link->wql_link.right_setid = waitq->waitq_set_id;
3652         wql_mkvalid(link);
3653
3654         waitq->waitq_set_id = link->wql_setid.id;
3655
3656         return KERN_SUCCESS;
3657 }
3658
3659 /**
3660  * link 'waitq' to 'wqset'
3661  *
3662  * Conditions:
3663  *      if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.
3664  *      Otherwise, 'waitq' must be locked.
3665  *
3666  *      may (rarely) block on link table allocation if the table has to grow,
3667  *      and no 'reserved_link' object is passed.
3668  *
3669  * Notes:
3670  *      The caller can guarantee that this function will never block by
3671  *      pre-allocating a link table object and passing its ID in 'reserved_link'
3672  */
3673 kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset,
3674                          waitq_lock_state_t lock_state, uint64_t *reserved_link)
3675 {
3676         kern_return_t kr;
3677         struct waitq_link *link;
3678         int should_lock = (lock_state == WAITQ_SHOULD_LOCK);
3679
3680         if (!waitq_valid(waitq) || waitq_irq_safe(waitq))
3681                 panic("Invalid waitq: %p", waitq);
3682
3683         if (!waitqs_is_set(wqset))
3684                 return KERN_INVALID_ARGUMENT;
3685
3686         wqdbg_v("Link waitq %p to wqset 0x%llx",
3687                 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
3688
3689         /*
3690          * We _might_ need a new link object here, so we'll grab outside
3691          * the lock because the alloc call _might_ block.
3692          *
3693          * If the caller reserved a link beforehand, then wql_get_link
3694          * is guaranteed not to block because the caller holds an extra
3695          * reference to the link which, in turn, hold a reference to the
3696          * link table.
3697          */
3698         if (reserved_link && *reserved_link != 0) {
3699                 link = wql_get_reserved(*reserved_link, WQL_LINK);
3700                 /* always consume the caller's reference */
3701                 *reserved_link = 0;
3702         } else {
3703                 link = wql_alloc_link(WQL_LINK);
3704         }
3705         if (!link)
3706                 return KERN_NO_SPACE;
3707
3708         if (should_lock) {
3709                 waitq_lock(waitq);
3710         }
3711
3712         kr = waitq_link_internal(waitq, wqset->wqset_id, link);
3713
3714         if (should_lock) {
3715                 waitq_unlock(waitq);
3716         }
3717
3718         wql_put_link(link);
3719
3720         return kr;
3721 }
3722
3723 /**
3724  * helper: unlink 'waitq' from waitq set identified by 'setid'
3725  *         this function also prunes invalid objects from the tree
3726  *
3727  * Conditions:
3728  *      MUST be called from walk_waitq_links link table walk
3729  *      'waitq' is locked
3730  *
3731  * Notes:
3732  *      This is a helper function which compresses the link table by culling
3733  *      unused or unnecessary links. See comments below for different
3734  *      scenarios.
3735  */
3736 static inline int waitq_maybe_remove_link(struct waitq *waitq,
3737                                           uint64_t setid,
3738                                           struct waitq_link *parent,
3739                                           struct waitq_link *left,
3740                                           struct waitq_link *right)
3741 {
3742         uint64_t *wq_setid = &waitq->waitq_set_id;
3743
3744         /*
3745          * There are two scenarios:
3746          *
3747          * Scenario 1:
3748          * --------------------------------------------------------------------
3749          * waitq->waitq_set_id == parent
3750          *
3751          *         parent(LINK)
3752          *           /    \
3753          *          /      \
3754          *         /        \
3755          *  L(LINK/WQS_l)   R(LINK/WQS_r)
3756          *
3757          * In this scenario, we assert that the original waitq points to the
3758          * parent link we were passed in.  If WQS_l (or WQS_r) is the waitq
3759          * set we're looking for, we can set the corresponding parent
3760          * link id (left or right) to 0.  To compress the tree, we can reset the
3761          * waitq_set_id of the original waitq to point to the side of the
3762          * parent that is still valid. We then discard the parent link object.
3763          */
3764         if (*wq_setid == parent->wql_setid.id) {
3765                 if (!left && !right) {
3766                         /* completely invalid children */
3767                         wql_invalidate(parent);
3768                         wqdbg_v("S1, L+R");
3769                         *wq_setid = 0;
3770                         return WQ_ITERATE_INVALID;
3771                 } else if (!left || left->wql_setid.id == setid) {
3772                         /*
3773                          * left side matches we know it points either to the
3774                          * WQS we're unlinking, or to an invalid object:
3775                          * no need to invalidate it
3776                          */
3777                         *wq_setid = right ? right->wql_setid.id : 0;
3778                         wql_invalidate(parent);
3779                         wqdbg_v("S1, L");
3780                         return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
3781                 } else if (!right || right->wql_setid.id == setid) {
3782                         /*
3783                          * if right side matches we know it points either to the
3784                          * WQS we're unlinking, or to an invalid object:
3785                          * no need to invalidate it
3786                          */
3787                         *wq_setid = left ? left->wql_setid.id : 0;
3788                         wql_invalidate(parent);
3789                         wqdbg_v("S1, R");
3790                         return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
3791                 }
3792         }
3793
3794         /*
3795          * the tree walk starts at the top-of-tree and moves down,
3796          * so these are safe asserts.
3797          */
3798         assert(left || right); /* one of them has to be valid at this point */
3799
3800         /*
3801          * Scenario 2:
3802          * --------------------------------------------------------------------
3803          * waitq->waitq_set_id == ... (OR parent)
3804          *
3805          *                    ...
3806          *                     |
3807          *                   parent
3808          *                   /    \
3809          *                  /      \
3810          *              L(LINK)     R(LINK)
3811          *               /\             /\
3812          *              /  \           /  \
3813          *             /    \       Rl(*)  Rr(*)
3814          *         Ll(WQS)  Lr(WQS)
3815          *
3816          * In this scenario, a leaf node of either the left or right side
3817          * could be the wait queue set we're looking to unlink. We also handle
3818          * the case where one of these links is invalid.  If a leaf node is
3819          * invalid or it's the set we're looking for, we can safely remove the
3820          * middle link (left or right) and point the parent link directly to
3821          * the remaining leaf node.
3822          */
3823         if (left && wql_type(left) == WQL_LINK) {
3824                 uint64_t Ll, Lr;
3825                 struct waitq_link *linkLl, *linkLr;
3826                 assert(left->wql_setid.id != setid);
3827                 Ll = left->wql_link.left_setid;
3828                 Lr = left->wql_link.right_setid;
3829                 linkLl = wql_get_link(Ll);
3830                 linkLr = wql_get_link(Lr);
3831                 if (!linkLl && !linkLr) {
3832                         /*
3833                          * The left object points to two invalid objects!
3834                          * We can invalidate the left w/o touching the parent.
3835                          */
3836                         wql_invalidate(left);
3837                         wqdbg_v("S2, Ll+Lr");
3838                         return WQ_ITERATE_INVALID;
3839                 } else if (!linkLl || Ll == setid) {
3840                         /* Ll is invalid and/or the wait queue set we're looking for */
3841                         parent->wql_link.left_setid = Lr;
3842                         wql_invalidate(left);
3843                         wql_put_link(linkLl);
3844                         wql_put_link(linkLr);
3845                         wqdbg_v("S2, Ll");
3846                         return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
3847                 } else if (!linkLr || Lr == setid) {
3848                         /* Lr is invalid and/or the wait queue set we're looking for */
3849                         parent->wql_link.left_setid = Ll;
3850                         wql_invalidate(left);
3851                         wql_put_link(linkLr);
3852                         wql_put_link(linkLl);
3853                         wqdbg_v("S2, Lr");
3854                         return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
3855                 }
3856                 wql_put_link(linkLl);
3857                 wql_put_link(linkLr);
3858         }
3859
3860         if (right && wql_type(right) == WQL_LINK) {
3861                 uint64_t Rl, Rr;
3862                 struct waitq_link *linkRl, *linkRr;
3863                 assert(right->wql_setid.id != setid);
3864                 Rl = right->wql_link.left_setid;
3865                 Rr = right->wql_link.right_setid;
3866                 linkRl = wql_get_link(Rl);
3867                 linkRr = wql_get_link(Rr);
3868                 if (!linkRl && !linkRr) {
3869                         /*
3870                          * The right object points to two invalid objects!
3871                          * We can invalidate the right w/o touching the parent.
3872                          */
3873                         wql_invalidate(right);
3874                         wqdbg_v("S2, Rl+Rr");
3875                         return WQ_ITERATE_INVALID;
3876                 } else if (!linkRl || Rl == setid) {
3877                         /* Rl is invalid and/or the wait queue set we're looking for */
3878                         parent->wql_link.right_setid = Rr;
3879                         wql_invalidate(right);
3880                         wql_put_link(linkRl);
3881                         wql_put_link(linkRr);
3882                         wqdbg_v("S2, Rl");
3883                         return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
3884                 } else if (!linkRr || Rr == setid) {
3885                         /* Rr is invalid and/or the wait queue set we're looking for */
3886                         parent->wql_link.right_setid = Rl;
3887                         wql_invalidate(right);
3888                         wql_put_link(linkRl);
3889                         wql_put_link(linkRr);
3890                         wqdbg_v("S2, Rr");
3891                         return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
3892                 }
3893                 wql_put_link(linkRl);
3894                 wql_put_link(linkRr);
3895         }
3896
3897         return WQ_ITERATE_CONTINUE;
3898 }
3899
3900 /**
3901  * link table walk callback that unlinks 'waitq' from 'ctx->setid'
3902  *
3903  * Conditions:
3904  *      called from walk_waitq_links
3905  *      'waitq' is locked
3906  *
3907  * Notes:
3908  *      uses waitq_maybe_remove_link() to compress the linktable and
3909  *      perform the actual unlinking
3910  */
3911 static int waitq_unlink_cb(struct waitq *waitq, void *ctx,
3912                            struct waitq_link *link)
3913 {
3914         uint64_t setid = *((uint64_t *)ctx);
3915         struct waitq_link *right, *left;
3916         int ret = 0;
3917
3918         if (wql_type(link) != WQL_LINK)
3919                 return WQ_ITERATE_CONTINUE;
3920
3921         do  {
3922                 left  = wql_get_link(link->wql_link.left_setid);
3923                 right = wql_get_link(link->wql_link.right_setid);
3924
3925                 ret = waitq_maybe_remove_link(waitq, setid, link, left, right);
3926
3927                 wql_put_link(left);
3928                 wql_put_link(right);
3929
3930                 if (!wql_is_valid(link))
3931                         return WQ_ITERATE_INVALID;
3932                 /* A ret value of UNLINKED will break us out of table walk */
3933         } while (ret == WQ_ITERATE_INVALID);
3934
3935         return ret;
3936 }
3937
3938
3939 /**
3940  * undo/remove a prepost from 'ctx' (waitq) to 'wqset'
3941  *
3942  * Conditions:
3943  *      Called from wq_prepost_foreach_locked OR wq_prepost_iterate
3944  *      'wqset' may be NULL
3945  *      (ctx)->unlink_wqset is locked
3946  */
3947 static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
3948                                    struct wq_prepost *wqp, struct waitq *waitq)
3949 {
3950         struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;
3951
3952         if (waitq != ulctx->unlink_wq)
3953                 return WQ_ITERATE_CONTINUE;
3954
3955         if (wqp_type(wqp) == WQP_WQ &&
3956             wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {
3957                 /* this is the only prepost on this wait queue set */
3958                 wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);
3959                 ulctx->unlink_wqset->wqset_prepost_id = 0;
3960                 return WQ_ITERATE_BREAK;
3961         }
3962
3963         assert(wqp_type(wqp) == WQP_POST);
3964
3965         /*
3966          * The prepost object 'wqp' points to a waitq which should no longer
3967          * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost
3968          * object from the list and break out of the iteration. Using the
3969          * context object in this way allows this same callback function to be
3970          * used from both wq_prepost_foreach_locked and wq_prepost_iterate.
3971          */
3972         wq_prepost_remove(ulctx->unlink_wqset, wqp);
3973         return WQ_ITERATE_BREAK;
3974 }
3975
3976 /**
3977  * unlink 'waitq' from 'wqset'
3978  *
3979  * Conditions:
3980  *      'waitq' is locked
3981  *      'wqset' is _not_ locked
3982  *      may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock
3983  *      (see waitq_clear_prepost_locked)
3984  */
3985 static kern_return_t waitq_unlink_locked(struct waitq *waitq,
3986                                          struct waitq_set *wqset)
3987 {
3988         uint64_t setid;
3989         kern_return_t kr;
3990
3991         assert(!waitq_irq_safe(waitq));
3992
3993         setid = wqset->wqset_id;
3994
3995         if (waitq->waitq_set_id == 0) {
3996                 /*
3997                  * TODO:
3998                  * it doesn't belong to anyone, and it has a prepost object?
3999                  * This is an artifact of not cleaning up after kqueues when
4000                  * they prepost into select sets...
4001                  */
4002                 if (waitq->waitq_prepost_id != 0)
4003                         (void)waitq_clear_prepost_locked(waitq);
4004                 return KERN_NOT_IN_SET;
4005         }
4006
4007         if (waitq->waitq_set_id == setid) {
4008                 waitq->waitq_set_id = 0;
4009                 /*
4010                  * This was the only set to which the waitq belonged: we can
4011                  * safely release the waitq's prepost object. It doesn't
4012                  * matter if this function drops and re-acquires the lock
4013                  * because we're not manipulating waitq state any more.
4014                  */
4015                 (void)waitq_clear_prepost_locked(waitq);
4016                 return KERN_SUCCESS;
4017         }
4018
4019         /*
4020          * The waitq was a member of more that 1 set, so we need to
4021          * handle potentially compressing the link table, and
4022          * adjusting the waitq->waitq_set_id value.
4023          *
4024          * Note: we can't free the waitq's associated prepost object (if any)
4025          *       because it may be in use by the one or more _other_ sets to
4026          *       which this queue belongs.
4027          *
4028          * Note: This function only handles a single level of the queue linkage.
4029          *       Removing a waitq from a set to which it does not directly
4030          *       belong is undefined. For example, if a waitq belonged to set
4031          *       A, and set A belonged to set B. You can't remove the waitq
4032          *       from set B.
4033          */
4034         kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4035                               WQL_LINK, (void *)&setid, waitq_unlink_cb);
4036
4037         if (kr == WQ_ITERATE_UNLINKED) {
4038                 struct wq_unlink_ctx ulctx;
4039
4040                 kr = KERN_SUCCESS; /* found it and dis-associated it */
4041
4042                 /* don't look for preposts if it's not prepost-enabled */
4043                 if (!wqset->wqset_q.waitq_prepost)
4044                         goto out;
4045
4046                 assert(!waitq_irq_safe(&wqset->wqset_q));
4047
4048                 waitq_set_lock(wqset);
4049                 /*
4050                  * clear out any prepost from waitq into wqset
4051                  * TODO: this could be more efficient than a linear search of
4052                  *       the waitq set's prepost list.
4053                  */
4054                 ulctx.unlink_wq = waitq;
4055                 ulctx.unlink_wqset = wqset;
4056                 (void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,
4057                                          waitq_unlink_prepost_cb);
4058                 waitq_set_unlock(wqset);
4059         } else {
4060                 kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */
4061         }
4062
4063 out:
4064         return kr;
4065 }
4066
4067 /**
4068  * unlink 'waitq' from 'wqset'
4069  *
4070  * Conditions:
4071  *      neither 'waitq' nor 'wqset' is locked
4072  *      may disable and re-enable interrupts
4073  *      may (rarely) spin in prepost clear
4074  *      (see waitq_clear_prepost_locked)
4075  */
4076 kern_return_t waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)
4077 {
4078         kern_return_t kr = KERN_SUCCESS;
4079
4080         assert(waitqs_is_set(wqset));
4081
4082         /*
4083          * we allow the waitq to be invalid because the caller may be trying
4084          * to clear out old/dirty state
4085          */
4086         if (!waitq_valid(waitq))
4087                 return KERN_INVALID_ARGUMENT;
4088
4089         wqdbg_v("unlink waitq %p from set 0x%llx",
4090                 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4091
4092         assert(!waitq_irq_safe(waitq));
4093
4094         waitq_lock(waitq);
4095
4096         kr = waitq_unlink_locked(waitq, wqset);
4097
4098         waitq_unlock(waitq);
4099         return kr;
4100 }
4101
4102 /**
4103  * unlink a waitq from a waitq set, but reference the waitq by its prepost ID
4104  *
4105  * Conditions:
4106  *      'wqset' is unlocked
4107  *      wqp_id may be valid or invalid
4108  */
4109 void waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)
4110 {
4111         struct wq_prepost *wqp;
4112
4113         disable_preemption();
4114         wqp = wq_prepost_get(wqp_id);
4115         if (wqp) {
4116                 struct waitq *wq;
4117
4118                 wq = wqp->wqp_wq.wqp_wq_ptr;
4119
4120                 /*
4121                  * lock the waitq, then release our prepost ID reference, then
4122                  * unlink the waitq from the wqset: this ensures that we don't
4123                  * hold a prepost ID reference during the unlink, but we also
4124                  * complete the unlink operation atomically to avoid a race
4125                  * with waitq_unlink[_all].
4126                  */
4127                 assert(!waitq_irq_safe(wq));
4128
4129                 waitq_lock(wq);
4130                 wq_prepost_put(wqp);
4131
4132                 if (!waitq_valid(wq)) {
4133                         /* someone already tore down this waitq! */
4134                         waitq_unlock(wq);
4135                         enable_preemption();
4136                         return;
4137                 }
4138
4139                 /* this _may_ drop the wq lock, but that's OK */
4140                 waitq_unlink_locked(wq, wqset);
4141
4142                 waitq_unlock(wq);
4143         }
4144         enable_preemption();
4145         return;
4146 }
4147
4148
4149 /**
4150  * reference and lock a waitq by its prepost ID
4151  *
4152  * Conditions:
4153  *      wqp_id may be valid or invalid
4154  *
4155  * Returns:
4156  *      a locked waitq if wqp_id was valid
4157  *      NULL on failure
4158  */
4159 struct waitq *waitq_lock_by_prepost_id(uint64_t wqp_id)
4160 {
4161         struct waitq *wq = NULL;
4162         struct wq_prepost *wqp;
4163
4164         disable_preemption();
4165         wqp = wq_prepost_get(wqp_id);
4166         if (wqp) {
4167                 wq = wqp->wqp_wq.wqp_wq_ptr;
4168
4169                 assert(!waitq_irq_safe(wq));
4170
4171                 waitq_lock(wq);
4172                 wq_prepost_put(wqp);
4173
4174                 if (!waitq_valid(wq)) {
4175                         /* someone already tore down this waitq! */
4176                         waitq_unlock(wq);
4177                         enable_preemption();
4178                         return NULL;
4179                 }
4180         }
4181         enable_preemption();
4182         return wq;
4183 }
4184
4185
4186 /**
4187  * unlink 'waitq' from all sets to which it belongs
4188  *
4189  * Conditions:
4190  *      'waitq' is locked on entry
4191  *      returns with waitq lock dropped
4192  *
4193  * Notes:
4194  *      may (rarely) spin (see waitq_clear_prepost_locked)
4195  */
4196 kern_return_t waitq_unlink_all_unlock(struct waitq *waitq)
4197 {
4198         uint64_t old_set_id = 0;
4199         wqdbg_v("unlink waitq %p from all sets",
4200                 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
4201         assert(!waitq_irq_safe(waitq));
4202
4203         /* it's not a member of any sets */
4204         if (waitq->waitq_set_id == 0) {
4205                 waitq_unlock(waitq);
4206                 return KERN_SUCCESS;
4207         }
4208
4209         old_set_id = waitq->waitq_set_id;
4210         waitq->waitq_set_id = 0;
4211
4212         /*
4213          * invalidate the prepost entry for this waitq.
4214          * This may drop and re-acquire the waitq lock, but that's OK because
4215          * if it was added to another set and preposted to that set in the
4216          * time we drop the lock, the state will remain consistent.
4217          */
4218         (void)waitq_clear_prepost_locked(waitq);
4219
4220         waitq_unlock(waitq);
4221
4222         if (old_set_id) {
4223                 /*
4224                  * Walk the link table and invalidate each LINK object that
4225                  * used to connect this waitq to one or more sets: this works
4226                  * because WQL_LINK objects are private to each wait queue
4227                  */
4228                 (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id,
4229                                        WQL_LINK, NULL, waitq_unlink_all_cb);
4230         }
4231
4232         return KERN_SUCCESS;
4233 }
4234
4235 /**
4236  * unlink 'waitq' from all sets to which it belongs
4237  *
4238  * Conditions:
4239  *      'waitq' is not locked
4240  *      may disable and re-enable interrupts
4241  *      may (rarely) spin
4242  *      (see waitq_unlink_all_locked, waitq_clear_prepost_locked)
4243  */
4244 kern_return_t waitq_unlink_all(struct waitq *waitq)
4245 {
4246         kern_return_t kr = KERN_SUCCESS;
4247
4248         if (!waitq_valid(waitq))
4249                 panic("Invalid waitq: %p", waitq);
4250
4251         assert(!waitq_irq_safe(waitq));
4252         waitq_lock(waitq);
4253         if (!waitq_valid(waitq)) {
4254                 waitq_unlock(waitq);
4255                 return KERN_SUCCESS;
4256         }
4257
4258         kr = waitq_unlink_all_unlock(waitq);
4259         /* waitq unlocked and set links deallocated */
4260
4261         return kr;
4262 }
4263
4264
4265 /**
4266  * unlink all waitqs from 'wqset'
4267  *
4268  * Conditions:
4269  *      'wqset' is locked on entry
4270  *      'wqset' is unlocked on exit and spl is restored
4271  *
4272  * Note:
4273  *      may (rarely) spin/block (see waitq_clear_prepost_locked)
4274  */
4275 kern_return_t waitq_set_unlink_all_unlock(struct waitq_set *wqset)
4276 {
4277         struct waitq_link *link;
4278         uint64_t prepost_id;
4279
4280         wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);
4281
4282         /*
4283          * This operation does not require interaction with any of the set's
4284          * constituent wait queues. All we have to do is invalidate the SetID
4285          */
4286
4287         /* invalidate and re-alloc the link object first */
4288         link = wql_get_link(wqset->wqset_id);
4289
4290         /* we may have raced with a waitq_set_deinit: handle this */
4291         if (!link) {
4292                 waitq_set_unlock(wqset);
4293                 return KERN_SUCCESS;
4294         }
4295
4296         wql_invalidate(link);
4297
4298         /* re-alloc the object to get a new generation ID */
4299         wql_realloc_link(link, WQL_WQS);
4300         link->wql_wqs.wql_set = wqset;
4301
4302         wqset->wqset_id = link->wql_setid.id;
4303         wql_mkvalid(link);
4304         wql_put_link(link);
4305
4306         /* clear any preposts attached to this set */
4307         prepost_id = 0;
4308         if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id)
4309                 prepost_id = wqset->wqset_prepost_id;
4310         /* else { TODO: notify kqueue subsystem? } */
4311         wqset->wqset_prepost_id = 0;
4312
4313         /*
4314          * clear set linkage and prepost object associated with this set:
4315          * waitq sets may prepost to other sets if, for example, they are
4316          * associated with a kqueue which is in a select set.
4317          *
4318          * This releases all the set link objects
4319          * (links to other sets to which this set was previously added)
4320          */
4321         waitq_unlink_all_unlock(&wqset->wqset_q);
4322         /* wqset->wqset_q unlocked */
4323
4324         /* drop / unlink all the prepost table objects */
4325         if (prepost_id)
4326                 (void)wq_prepost_iterate(prepost_id, NULL,
4327                                          wqset_clear_prepost_chain_cb);
4328
4329         return KERN_SUCCESS;
4330 }
4331
4332 /**
4333  * unlink all waitqs from 'wqset'
4334  *
4335  * Conditions:
4336  *      'wqset' is not locked
4337  *      may (rarely) spin/block (see waitq_clear_prepost_locked)
4338  */
4339 kern_return_t waitq_set_unlink_all(struct waitq_set *wqset)
4340 {
4341         assert(waitqs_is_set(wqset));
4342         assert(!waitq_irq_safe(&wqset->wqset_q));
4343
4344         waitq_set_lock(wqset);
4345         return waitq_set_unlink_all_unlock(wqset);
4346         /* wqset unlocked and set links and preposts deallocated */
4347 }
4348
4349 static int waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,
4350                                     struct waitq_link *link)
4351 {
4352         uint32_t *num = (uint32_t *)ctx;
4353         (void)waitq;
4354
4355         /*
4356          * In the worst case, we'll have to allocate 2 prepost objects
4357          * per waitq set (if the set was already preposted by another
4358          * waitq).
4359          */
4360         if (wql_type(link) == WQL_WQS) {
4361                 /*
4362                  * check to see if the associated waitq actually supports
4363                  * preposting
4364                  */
4365                 if (waitq_set_can_prepost(link->wql_wqs.wql_set))
4366                         *num += 2;
4367         }
4368         return WQ_ITERATE_CONTINUE;
4369 }
4370
4371 static int waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
4372                                            int *did_unlock, struct wq_prepost **wqp)
4373 {
4374         struct wq_prepost *tmp;
4375         struct wqp_cache *cache;
4376
4377         *did_unlock = 0;
4378
4379         /*
4380          * Before we unlock the waitq, check the per-processor prepost object
4381          * cache to see if there's enough there for us. If so, do the
4382          * allocation, keep the lock and save an entire iteration over the set
4383          * linkage!
4384          */
4385         if (waitq) {
4386                 disable_preemption();
4387                 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
4388                 if (nalloc <= (int)cache->avail)
4389                         goto do_alloc;
4390                 enable_preemption();
4391
4392                 /* unlock the waitq to perform the allocation */
4393                 *did_unlock = 1;
4394                 waitq_unlock(waitq);
4395         }
4396
4397 do_alloc:
4398         tmp = wq_prepost_alloc(LT_RESERVED, nalloc);
4399         if (!tmp)
4400                 panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",
4401                       nalloc, waitq, *wqp);
4402         if (*wqp) {
4403                 /* link the two lists */
4404                 int __assert_only rc;
4405                 rc = wq_prepost_rlink(tmp, *wqp);
4406                 assert(rc == nalloc);
4407         }
4408         *wqp = tmp;
4409
4410         /*
4411          * If the caller can block, then enforce a minimum-free table element
4412          * policy here. This helps ensure that we will have enough prepost
4413          * objects for callers such as selwakeup() that can be called with
4414          * spin locks held.
4415          */
4416         if (get_preemption_level() == 0)
4417                 wq_prepost_ensure_free_space();
4418
4419         if (waitq) {
4420                 if (*did_unlock == 0) {
4421                         /* decrement the preemption count if alloc from cache */
4422                         enable_preemption();
4423                 } else {
4424                         /* otherwise: re-lock the waitq */
4425                         waitq_lock(waitq);
4426                 }
4427         }
4428
4429         return nalloc;
4430 }
4431
4432 static int waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)
4433 {
4434         int npreposts = 0;
4435
4436         /*
4437          * If the waitq is not currently part of a set, and we're not asked to
4438          * keep the waitq locked then we'll want to have 3 in reserve
4439          * just-in-case it becomes part of a set while we unlock and reserve.
4440          * We may need up to 1 object for the waitq, and 2 for the set.
4441          */
4442         if (waitq->waitq_set_id == 0) {
4443                 npreposts = 3;
4444         } else {
4445                 /* this queue has never been preposted before */
4446                 if (waitq->waitq_prepost_id == 0)
4447                         npreposts = 3;
4448
4449                 /*
4450                  * Walk the set of table linkages associated with this waitq
4451                  * and count the worst-case number of prepost objects that
4452                  * may be needed during a wakeup_all. We can walk this without
4453                  * locking each set along the way because the table-based IDs
4454                  * disconnect us from the set pointers themselves, and the
4455                  * table walking is careful to read the setid values only once.
4456                  * Locking each set up the chain also doesn't guarantee that
4457                  * their membership won't change between the time we unlock
4458                  * that set and when we actually go to prepost, so our
4459                  * situation is no worse than before and we've alleviated lock
4460                  * contention on any sets to which this waitq belongs.
4461                  */
4462                 (void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED,
4463                                        waitq, waitq->waitq_set_id,
4464                                        WQL_WQS, (void *)&npreposts,
4465                                        waitq_prepost_reserve_cb);
4466         }
4467
4468         if (extra > 0)
4469                 npreposts += extra;
4470
4471         if (npreposts == 0 && !keep_locked) {
4472                 /*
4473                  * If we get here, we were asked to reserve some prepost
4474                  * objects for a waitq that's previously preposted, and is not
4475                  * currently a member of any sets. We have also been
4476                  * instructed to unlock the waitq when we're done. In this
4477                  * case, we pre-allocated enough reserved objects to handle
4478                  * the case where the waitq gets added to a single set when
4479                  * the lock is released.
4480                  */
4481                 npreposts = 3;
4482         }
4483
4484         return npreposts;
4485 }
4486
4487
4488 /**
4489  * pre-allocate prepost objects for 'waitq'
4490  *
4491  * Conditions:
4492  *      'waitq' is not locked
4493  *
4494  * Returns:
4495  *      panic on error
4496  *
4497  *      0 on success, '*reserved' is set to the head of a singly-linked
4498  *      list of pre-allocated prepost objects.
4499  *
4500  * Notes:
4501  *      If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation
4502  *      atomically and returns 'waitq' locked.
4503  *
4504  *      This function attempts to pre-allocate precisely enough prepost
4505  *      objects based on the current set membership of 'waitq'. If the
4506  *      operation is performed atomically, then the caller
4507  *      is guaranteed to have enough pre-allocated prepost object to avoid
4508  *      any (rare) blocking in the wakeup path.
4509  */
4510 uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra,
4511                                waitq_lock_state_t lock_state)
4512 {
4513         uint64_t reserved = 0;
4514         uint64_t prev_setid = 0, prev_prepostid = 0;
4515         struct wq_prepost *wqp = NULL;
4516         int nalloc = 0, npreposts = 0;
4517         int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);
4518         int unlocked = 0;
4519
4520         wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",
4521                 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);
4522
4523         if (waitq == NULL && extra > 0) {
4524                 /*
4525                  * Simple prepost object allocation:
4526                  * we'll add 2 more because the waitq might need an object,
4527                  * and the set itself may need a new POST object in addition
4528                  * to the number of preposts requested by the caller
4529                  */
4530                 nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL,
4531                                                          &unlocked, &wqp);
4532                 assert(nalloc == extra + 2);
4533                 return wqp->wqp_prepostid.id;
4534         }
4535
4536         assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);
4537
4538         assert(!waitq_irq_safe(waitq));
4539
4540         waitq_lock(waitq);
4541
4542         /* remember the set ID that we started with */
4543         prev_setid = waitq->waitq_set_id;
4544         prev_prepostid = waitq->waitq_prepost_id;
4545
4546         /*
4547          * If the waitq is not part of a set, and we're asked to
4548          * keep the set locked, then we don't have to reserve
4549          * anything!
4550          */
4551         if (prev_setid == 0 && keep_locked)
4552                 goto out;
4553
4554         npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
4555
4556         /* nothing for us to do! */
4557         if (npreposts == 0) {
4558                 if (keep_locked)
4559                         goto out;
4560                 goto out_unlock;
4561         }
4562
4563 try_alloc:
4564         /* this _may_ unlock and relock the waitq! */
4565         nalloc = waitq_alloc_prepost_reservation(npreposts, waitq,
4566                                                  &unlocked, &wqp);
4567
4568         if (!unlocked) {
4569                 /* allocation held the waitq lock: we'd done! */
4570                 if (keep_locked)
4571                         goto out;
4572                 goto out_unlock;
4573         }
4574
4575         /*
4576          * Before we return, if the allocation had to unlock the waitq, we
4577          * must check one more time to see if we have enough. If not, we'll
4578          * try to allocate the difference. If the caller requests it, we'll
4579          * also leave the waitq locked so that the use of the pre-allocated
4580          * prepost objects can be guaranteed to be enough if a wakeup_all is
4581          * performed before unlocking the waitq.
4582          */
4583
4584         /*
4585          * If the waitq is no longer associated with a set, or if the waitq's
4586          * set/prepostid has not changed since we first walked its linkage,
4587          * we're done.
4588          */
4589         if ((waitq->waitq_set_id == 0) ||
4590             (waitq->waitq_set_id == prev_setid &&
4591              waitq->waitq_prepost_id == prev_prepostid)) {
4592                 if (keep_locked)
4593                         goto out;
4594                 goto out_unlock;
4595         }
4596
4597         npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
4598
4599         if (npreposts > nalloc) {
4600                 prev_setid = waitq->waitq_set_id;
4601                 prev_prepostid = waitq->waitq_prepost_id;
4602                 npreposts = npreposts - nalloc; /* only allocate the diff */
4603                 goto try_alloc;
4604         }
4605
4606         if (keep_locked)
4607                 goto out;
4608
4609 out_unlock:
4610         waitq_unlock(waitq);
4611 out:
4612         if (wqp)
4613                 reserved = wqp->wqp_prepostid.id;
4614
4615         return reserved;
4616 }
4617
/**
 * give back a list of prepost objects obtained from waitq_prepost_reserve
 *
 * 'id' is the prepost ID of the list head. If it does not resolve to a
 * reserved object (wq_prepost_rfirst returns NULL), nothing is released.
 *
 * Conditions:
 *	may (rarely) spin waiting for prepost table growth memcpy
 */
void waitq_prepost_release_reserve(uint64_t id)
{
	struct wq_prepost *head;

	wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);

	head = wq_prepost_rfirst(id);
	if (head) {
		wq_prepost_release_rlist(head);
	}
}
4636
4637
4638 /**
4639  * clear all preposts from 'wqset'
4640  *
4641  * Conditions:
4642  *      'wqset' is not locked
4643  */
4644 void waitq_set_clear_preposts(struct waitq_set *wqset)
4645 {
4646         uint64_t prepost_id;
4647         spl_t spl;
4648
4649         assert(waitqs_is_set(wqset));
4650
4651         if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id)
4652                 return;
4653
4654         wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",
4655                 wqset->wqset_id);
4656
4657         if (waitq_irq_safe(&wqset->wqset_q))
4658                 spl = splsched();
4659         waitq_set_lock(wqset);
4660         prepost_id = wqset->wqset_prepost_id;
4661         wqset->wqset_prepost_id = 0;
4662         waitq_set_unlock(wqset);
4663         if (waitq_irq_safe(&wqset->wqset_q))
4664                 splx(spl);
4665
4666         /* drop / unlink all the prepost table objects */
4667         if (prepost_id)
4668                 (void)wq_prepost_iterate(prepost_id, NULL,
4669                                          wqset_clear_prepost_chain_cb);
4670 }
4671
4672
4673 /* ----------------------------------------------------------------------
4674  *
4675  * Iteration: waitq -> sets / waitq_set -> preposts
4676  *
4677  * ---------------------------------------------------------------------- */
4678
4679 struct wq_it_ctx {
4680         void *input;
4681         void *ctx;
4682         waitq_iterator_t it;
4683 };
4684
4685 static int waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,
4686                                  struct waitq_link *link)
4687 {
4688         struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
4689         struct waitq_set *wqset;
4690         int ret;
4691
4692         (void)waitq;
4693         assert(!waitq_irq_safe(waitq));
4694         assert(wql_type(link) == WQL_WQS);
4695
4696         /*
4697          * the waitq is locked, so we can just take the set lock
4698          * and call the iterator function
4699          */
4700         wqset = link->wql_wqs.wql_set;
4701         assert(wqset != NULL);
4702         assert(!waitq_irq_safe(&wqset->wqset_q));
4703         waitq_set_lock(wqset);
4704
4705         ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);
4706
4707         waitq_set_unlock(wqset);
4708         return ret;
4709 }
4710
4711 /**
4712  * call external iterator function for each prepost object in wqset
4713  *
4714  * Conditions:
4715  *      Called from wq_prepost_foreach_locked
4716  *      (wqset locked, waitq _not_ locked)
4717  */
4718 static int wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,
4719                                     struct wq_prepost *wqp, struct waitq *waitq)
4720 {
4721         struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
4722         uint64_t wqp_id;
4723         int ret;
4724
4725         (void)wqp;
4726
4727         /*
4728          * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.
4729          * Taking the 'waitq' lock is a lock order violation, so we need to be
4730          * careful. We also must realize that we may have taken a reference to
4731          * the 'wqp' just as the associated waitq was being torn down (or
4732          * clearing all its preposts) - see waitq_clear_prepost_locked(). If
4733          * the 'wqp' is valid and we can get the waitq lock, then we are good
4734          * to go. If not, we need to back off, check that the 'wqp' hasn't
4735          * been invalidated, and try to re-take the locks.
4736          */
4737         assert(!waitq_irq_safe(waitq));
4738
4739         if (waitq_lock_try(waitq))
4740                 goto call_iterator;
4741
4742         if (!wqp_is_valid(wqp))
4743                 return WQ_ITERATE_RESTART;
4744
4745         /* We are passed a prepost object with a reference on it. If neither
4746          * the waitq set nor the waitq require interrupts disabled, then we
4747          * may block on the delay(1) call below. We can't hold a prepost
4748          * object reference while blocking, so we have to give that up as well
4749          * and re-acquire it when we come back.
4750          */
4751         wqp_id = wqp->wqp_prepostid.id;
4752         wq_prepost_put(wqp);
4753         waitq_set_unlock(wqset);
4754         wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",
4755                 wqset, wqp, wqp->wqp_prepostid.id, waitq);
4756         delay(1);
4757         waitq_set_lock(wqset);
4758         wqp = wq_prepost_get(wqp_id);
4759         if (!wqp)
4760                 /* someone cleared preposts while we slept! */
4761                 return WQ_ITERATE_DROPPED;
4762
4763         /*
4764          * TODO:
4765          * This differs slightly from the logic in ipc_mqueue.c:
4766          * ipc_mqueue_receive_on_thread(). There, if the waitq lock
4767          * can't be obtained, the prepost link is placed on the back of
4768          * the chain, and the iteration starts from the beginning. Here,
4769          * we just restart from the beginning.
4770          */
4771         return WQ_ITERATE_RESTART;
4772
4773 call_iterator:
4774         if (!wqp_is_valid(wqp)) {
4775                 ret = WQ_ITERATE_RESTART;
4776                 goto out_unlock;
4777         }
4778
4779         /* call the external callback */
4780         ret = wctx->it(wctx->ctx, waitq, wqset);
4781
4782         if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {
4783                 ret = WQ_ITERATE_BREAK;
4784                 goto out;
4785         }
4786
4787 out_unlock:
4788         waitq_unlock(waitq);
4789 out:
4790         return ret;
4791 }
4792
4793 /**
4794  * iterator over all sets to which the given waitq has been linked
4795  *
4796  * Conditions:
4797  *      'waitq' is locked
4798  */
4799 int waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)
4800 {
4801         int ret;
4802         struct wq_it_ctx wctx = {
4803                 .input = (void *)waitq,
4804                 .ctx = ctx,
4805                 .it = it,
4806         };
4807         if (!it || !waitq)
4808                 return KERN_INVALID_ARGUMENT;
4809
4810         ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4811                                WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb);
4812         if (ret == WQ_ITERATE_CONTINUE)
4813                 ret = WQ_ITERATE_SUCCESS;
4814         return ret;
4815 }
4816
4817 /**
4818  * iterator over all preposts in the given wqset
4819  *
4820  * Conditions:
4821  *      'wqset' is locked
4822  */
4823 int waitq_set_iterate_preposts(struct waitq_set *wqset,
4824                                void *ctx, waitq_iterator_t it)
4825 {
4826         struct wq_it_ctx wctx = {
4827                 .input = (void *)wqset,
4828                 .ctx = ctx,
4829                 .it = it,
4830         };
4831         if (!it || !wqset)
4832                 return WQ_ITERATE_INVALID;
4833
4834         assert(waitq_held(&wqset->wqset_q));
4835
4836         return wq_prepost_foreach_locked(wqset, (void *)&wctx,
4837                                          wqset_iterate_prepost_cb);
4838 }
4839
4840
4841 /* ----------------------------------------------------------------------
4842  *
4843  * Higher-level APIs
4844  *
4845  * ---------------------------------------------------------------------- */
4846
4847
4848 /**
4849  * declare a thread's intent to wait on 'waitq' for 'wait_event'
4850  *
4851  * Conditions:
4852  *      'waitq' is not locked
4853  */
4854 wait_result_t waitq_assert_wait64(struct waitq *waitq,
4855                                   event64_t wait_event,
4856                                   wait_interrupt_t interruptible,
4857                   uint64_t deadline)
4858 {
4859         thread_t thread = current_thread();
4860         wait_result_t ret;
4861         spl_t s;
4862
4863         if (!waitq_valid(waitq))
4864                 panic("Invalid waitq: %p", waitq);
4865
4866         if (waitq_irq_safe(waitq))
4867                 s = splsched();
4868
4869         waitq_lock(waitq);
4870         ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
4871                                          TIMEOUT_URGENCY_SYS_NORMAL,
4872                                          deadline, TIMEOUT_NO_LEEWAY, thread);
4873         waitq_unlock(waitq);
4874
4875         if (waitq_irq_safe(waitq))
4876                 splx(s);
4877
4878         return ret;
4879 }
4880
4881 /**
4882  * declare a thread's intent to wait on 'waitq' for 'wait_event'
4883  *
4884  * Conditions:
4885  *      'waitq' is not locked
4886  *      will disable and re-enable interrupts while locking current_thread()
4887  */
4888 wait_result_t waitq_assert_wait64_leeway(struct waitq *waitq,
4889                                          event64_t wait_event,
4890                                          wait_interrupt_t interruptible,
4891                                          wait_timeout_urgency_t urgency,
4892                                          uint64_t deadline,
4893                                          uint64_t leeway)
4894 {
4895         wait_result_t ret;
4896         thread_t thread = current_thread();
4897         spl_t s;
4898
4899         if (!waitq_valid(waitq))
4900                 panic("Invalid waitq: %p", waitq);
4901
4902         if (waitq_irq_safe(waitq))
4903                 s = splsched();
4904
4905         waitq_lock(waitq);
4906         ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
4907                                          urgency, deadline, leeway, thread);
4908         waitq_unlock(waitq);
4909
4910         if (waitq_irq_safe(waitq))
4911                 splx(s);
4912
4913         return ret;
4914 }
4915
4916 /**
4917  * wakeup a single thread from a waitq that's waiting for a given event
4918  *
4919  * Conditions:
4920  *      'waitq' is not locked
4921  *      may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
4922  *      may disable and re-enable interrupts
4923  *
4924  * Notes:
4925  *      will _not_ block if waitq is global (or not a member of any set)
4926  */
4927 kern_return_t waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
4928                                  wait_result_t result, int priority)
4929 {
4930         kern_return_t kr;
4931         uint64_t reserved_preposts = 0;
4932         spl_t spl;
4933
4934         if (!waitq_valid(waitq))
4935                 panic("Invalid waitq: %p", waitq);
4936
4937         if (!waitq_irq_safe(waitq)) {
4938                 /* reserve preposts in addition to locking the waitq */
4939                 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
4940         } else {
4941                 spl = splsched();
4942                 waitq_lock(waitq);
4943         }
4944
4945         /* waitq is locked upon return */
4946         kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
4947                                        &reserved_preposts, priority, WAITQ_UNLOCK);
4948
4949         if (waitq_irq_safe(waitq))
4950                 splx(spl);
4951
4952         /* release any left-over prepost object (won't block/lock anything) */
4953         waitq_prepost_release_reserve(reserved_preposts);
4954
4955         return kr;
4956 }
4957
4958 /**
4959  * wakeup all threads from a waitq that are waiting for a given event
4960  *
4961  * Conditions:
4962  *      'waitq' is not locked
4963  *      may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
4964  *      may disable and re-enable interrupts
4965  *
4966  * Notes:
4967  *      will _not_ block if waitq is global (or not a member of any set)
4968  */
4969 kern_return_t waitq_wakeup64_all(struct waitq *waitq,
4970                                  event64_t wake_event,
4971                                  wait_result_t result,
4972                                  int priority)
4973 {
4974         kern_return_t ret;
4975         uint64_t reserved_preposts = 0;
4976         spl_t s;
4977
4978         if (!waitq_valid(waitq))
4979                 panic("Invalid waitq: %p", waitq);
4980
4981         if (!waitq_irq_safe(waitq)) {
4982                 /* reserve preposts in addition to locking waitq */
4983                 reserved_preposts = waitq_prepost_reserve(waitq, 0,
4984                                                           WAITQ_KEEP_LOCKED);
4985         } else {
4986                 s = splsched();
4987                 waitq_lock(waitq);
4988         }
4989
4990         ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
4991                                         &reserved_preposts, priority,
4992                                         WAITQ_UNLOCK);
4993
4994         if (waitq_irq_safe(waitq))
4995                 splx(s);
4996
4997         waitq_prepost_release_reserve(reserved_preposts);
4998
4999         return ret;
5000
5001 }
5002
5003 /**
5004  * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
5005  *
5006  * Conditions:
5007  *      'waitq' is not locked
5008  *
5009  * Notes:
5010  *      May temporarily disable and re-enable interrupts
5011  */
5012 kern_return_t waitq_wakeup64_thread(struct waitq *waitq,
5013                                     event64_t wake_event,
5014                                     thread_t thread,
5015                                     wait_result_t result)
5016 {
5017         kern_return_t ret;
5018         spl_t s, th_spl;
5019
5020         if (!waitq_valid(waitq))
5021                 panic("Invalid waitq: %p", waitq);
5022
5023         if (waitq_irq_safe(waitq))
5024                 s = splsched();
5025         waitq_lock(waitq);
5026
5027         ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
5028         /* on success, returns 'thread' locked */
5029
5030         waitq_unlock(waitq);
5031
5032         if (ret == KERN_SUCCESS) {
5033                 ret = thread_go(thread, result);
5034                 assert(ret == KERN_SUCCESS);
5035                 thread_unlock(thread);
5036                 splx(th_spl);
5037                 waitq_stats_count_wakeup(waitq);
5038         } else {
5039                 ret = KERN_NOT_WAITING;
5040                 waitq_stats_count_fail(waitq);
5041         }
5042
5043         if (waitq_irq_safe(waitq))
5044                 splx(s);
5045
5046         return ret;
5047 }
5048
5049 /**
5050  * wakeup a single thread from a waitq that's waiting for a given event
5051  * and return a reference to that thread
5052  * returns THREAD_NULL if no thread was waiting
5053  *
5054  * Conditions:
5055  *      'waitq' is not locked
5056  *      may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5057  *      may disable and re-enable interrupts
5058  *
5059  * Notes:
5060  *      will _not_ block if waitq is global (or not a member of any set)
5061  */
5062 thread_t
5063 waitq_wakeup64_identify(struct waitq    *waitq,
5064                         event64_t       wake_event,
5065                         wait_result_t   result,
5066                         int             priority)
5067 {
5068         uint64_t reserved_preposts = 0;
5069         spl_t thread_spl = 0;
5070         thread_t thread;
5071         spl_t spl;
5072
5073         if (!waitq_valid(waitq))
5074                 panic("Invalid waitq: %p", waitq);
5075
5076         if (!waitq_irq_safe(waitq)) {
5077                 /* reserve preposts in addition to locking waitq */
5078                 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
5079         } else {
5080                 spl = splsched();
5081                 waitq_lock(waitq);
5082         }
5083
5084         thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,
5085                                                 &thread_spl, &reserved_preposts,
5086                                                 priority, WAITQ_UNLOCK);
5087         /* waitq is unlocked, thread is locked */
5088
5089         if (thread != THREAD_NULL) {
5090                 thread_reference(thread);
5091                 thread_unlock(thread);
5092                 splx(thread_spl);
5093         }
5094
5095         if (waitq_irq_safe(waitq))
5096                         splx(spl);
5097
5098         /* release any left-over prepost object (won't block/lock anything) */
5099         waitq_prepost_release_reserve(reserved_preposts);
5100
5101         /* returns +1 ref to running thread or THREAD_NULL */
5102         return thread;
5103 }
5104