git.saurik.com Git - apple/xnu.git/blob

1 /*

3 *

4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@

5 *

6 * This file contains Original Code and/or Modifications of Original Code

7 * as defined in and that are subject to the Apple Public Source License

8 * Version 2.0 (the 'License'). You may not use this file except in

9 * compliance with the License. The rights granted to you under the License

10 * may not be used to create, or enable the creation or redistribution of,

11 * unlawful or unlicensed copies of an Apple operating system, or to

12 * circumvent, violate, or enable the circumvention or violation of, any

13 * terms of an Apple operating system software license agreement.

14 *

15 * Please obtain a copy of the License at

16 * http://www.opensource.apple.com/apsl/ and read it before using this file.

17 *

18 * The Original Code and all software distributed under the License are

19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER

20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,

21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,

22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.

23 * Please see the License for the specific language governing rights and

24 * limitations under the License.

25 *

26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@

27 */

28 /*

29 * @OSF_FREE_COPYRIGHT@

30 */

31 /*

32 * Mach Operating System

35 *

36 * Permission to use, copy, modify and distribute this software and its

37 * documentation is hereby granted, provided that both the copyright

38 * notice and this permission notice appear in all copies of the

39 * software, derivative works or modified versions, and any portions

40 * thereof, and that both notices appear in supporting documentation.

41 *

42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"

43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR

44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.

45 *

46 * Carnegie Mellon requests users of this software to return to

47 *

48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU

49 * School of Computer Science

50 * Carnegie Mellon University

51 * Pittsburgh PA 15213-3890

52 *

53 * any improvements or extensions that they make and grant Carnegie Mellon

54 * the rights to redistribute these changes.

55 */

57 /*

58 * un-comment the following lines to debug the link/prepost tables

59 * NOTE: this expands each element by ~40 bytes

60 */

61 //#define KEEP_WAITQ_LINK_STATS

62 //#define KEEP_WAITQ_PREPOST_STATS

64 #include <kern/ast.h>

65 #include <kern/backtrace.h>

66 #include <kern/kern_types.h>

67 #include <kern/ltable.h>

68 #include <kern/mach_param.h>

69 #include <kern/percpu.h>

70 #include <kern/queue.h>

71 #include <kern/sched_prim.h>

72 #include <kern/simple_lock.h>

73 #include <kern/spl.h>

74 #include <kern/waitq.h>

75 #include <kern/zalloc.h>

76 #include <kern/policy_internal.h>

77 #include <kern/turnstile.h>

79 #include <os/hash.h>

80 #include <libkern/OSAtomic.h>

81 #include <mach/sync_policy.h>

82 #include <vm/vm_kern.h>

84 #include <sys/kdebug.h>

86 #if defined(KEEP_WAITQ_LINK_STATS) || defined(KEEP_WAITQ_PREPOST_STATS)

87 # if !CONFIG_LTABLE_STATS

88 # error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS"

89 # endif

90 # if !CONFIG_WAITQ_STATS

91 # error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS"

92 # endif

93 #endif

95 #if CONFIG_WAITQ_DEBUG

96 #define wqdbg(fmt, ...) \

         printf("WQ[%s]:  " fmt "\n", __func__, ## __VA_ARGS__)

98 #else

99 #define wqdbg(fmt, ...) do { } while (0)

100 #endif

101

102 #ifdef WAITQ_VERBOSE_DEBUG

103 #define wqdbg_v(fmt, ...) \

         printf("WQ[v:%s]:  " fmt "\n", __func__, ## __VA_ARGS__)

105 #else

106 #define wqdbg_v(fmt, ...) do { } while (0)

107 #endif

108

109 #define wqinfo(fmt, ...) \

         printf("WQ[%s]: " fmt "\n", __func__,  ## __VA_ARGS__)

111

112 #define wqerr(fmt, ...) \

         printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)

114

115 /*

116 * file-static functions / data

117 */

 static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,

119 uint64_t *reserved_preposts,

120 int priority, spl_t *spl);

121

 static kern_return_t waitq_select_thread_locked(struct waitq *waitq,

123 event64_t event,

124 thread_t thread, spl_t *spl);

125

 ZONE_DECLARE(waitq_set_zone, "waitq sets",

     sizeof(struct waitq_set), ZC_NOENCRYPT);

128

129 /* waitq prepost cache */

130 #define WQP_CACHE_MAX 50

131 struct wqp_cache {

132 uint64_t head;

133 unsigned int avail;

134 };

135 static struct wqp_cache PERCPU_DATA(wqp_cache);

136

137 #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))

138 #define ROUNDDOWN(x, y) (((x)/(y))*(y))

139

140

141 #if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS

 static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);

143 #endif

144

 LCK_GRP_DECLARE(waitq_lck_grp, "waitq");

146

147 #if __arm64__

148

149 #define waitq_lock_to(wq, to) \

150 (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to, &waitq_lck_grp))

151

152 #define waitq_lock_unlock(wq) \

153 (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))

154

155 #define waitq_lock_init(wq) \

156 (wq->waitq_interlock = 0)

157

158 #else

159

160 #define waitq_lock_to(wq, to) \

161 (hw_lock_to(&(wq)->waitq_interlock, to, &waitq_lck_grp))

162

163 #define waitq_lock_unlock(wq) \

164 (hw_lock_unlock(&(wq)->waitq_interlock))

165

166 #define waitq_lock_init(wq) \

167 (hw_lock_init(&(wq)->waitq_interlock))

168

169 #endif /* __arm64__ */

170

171 /*

172 * Prepost callback function for specially marked waitq sets

173 * (prepost alternative)

174 */

 extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);

176

177 #define DEFAULT_MIN_FREE_TABLE_ELEM 100

178 static uint32_t g_min_free_table_elem;

179 static uint32_t g_min_free_cache;

180

181

182 /* ----------------------------------------------------------------------

183 *

184 * SetID Link Table Implementation

185 *

186 * ---------------------------------------------------------------------- */

187 static struct link_table g_wqlinktable;

188

189 enum wq_link_type {

190 WQL_ALL = -1,

191 WQL_FREE = LT_FREE,

192 WQL_WQS = LT_ELEM,

193 WQL_LINK = LT_LINK,

194 };

195

196 struct waitq_link {

197 struct lt_elem wqte;

198

199 union {

200 /* wqt_type == WQL_WQS (LT_ELEM) */

201 struct {

202 struct waitq_set *wql_set;

203 /* uint64_t sl_prepost_id; */

204 } wql_wqs;

205

206 /* wqt_type == WQL_LINK (LT_LINK) */

207 struct {

208 uint64_t left_setid;

209 uint64_t right_setid;

210 } wql_link;

211 };

212 #ifdef KEEP_WAITQ_LINK_STATS

213 thread_t sl_alloc_th;

214 task_t sl_alloc_task;

215 uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];

216 uint64_t sl_alloc_ts;

217 uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];

218 uint64_t sl_invalidate_ts;

219 uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];

220 uint64_t sl_mkvalid_ts;

221 uint64_t sl_free_ts;

222 #endif

223 };

224 #if !defined(KEEP_WAITQ_LINK_STATS)

 static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0,

226 "waitq_link struct must be a power of two!");

227 #endif

228

229 #define wql_refcnt(link) \

230 (lt_bits_refcnt((link)->wqte.lt_bits))

231

232 #define wql_type(link) \

233 (lt_bits_type((link)->wqte.lt_bits))

234

235 #define wql_mkvalid(link) \

236 do { \

237 lt_elem_mkvalid(&(link)->wqte); \

238 wql_do_mkvalid_stats(&(link)->wqte); \

239 } while (0)

240

241 #define wql_is_valid(link) \

242 lt_bits_valid((link)->wqte.lt_bits)

243

244 #define wql_setid wqte.lt_id

245

246 #define WQL_WQS_POISON ((void *)(0xf00df00d))

247 #define WQL_LINK_POISON (0x0bad0badffffffffull)

248

249 static void

 wql_poison(struct link_table *table, struct lt_elem *elem)

251 {

         struct waitq_link *link = (struct waitq_link *)elem;

253 (void)table;

254

         switch (wql_type(link)) {

256 case WQL_WQS:

257 link->wql_wqs.wql_set = WQL_WQS_POISON;

258 break;

259 case WQL_LINK:

260 link->wql_link.left_setid = WQL_LINK_POISON;

261 link->wql_link.right_setid = WQL_LINK_POISON;

262 break;

263 default:

264 break;

265 }

266 #ifdef KEEP_WAITQ_LINK_STATS

         memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));

268 link->sl_alloc_ts = 0;

         memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));

270 link->sl_mkvalid_ts = 0;

271

272 link->sl_alloc_th = THREAD_NULL;

273 /* leave the sl_alloc_task in place for debugging */

274

275 link->sl_free_ts = mach_absolute_time();

276 #endif

277 }

278

279 #ifdef KEEP_WAITQ_LINK_STATS

280 static __inline__ void

 wql_do_alloc_stats(struct lt_elem *elem)

282 {

283 if (elem) {

                 struct waitq_link *link = (struct waitq_link *)elem;

                 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));

                 waitq_grab_backtrace(link->sl_alloc_bt, 0);

287 link->sl_alloc_th = current_thread();

288 link->sl_alloc_task = current_task();

289

                 assert(link->sl_alloc_ts == 0);

291 link->sl_alloc_ts = mach_absolute_time();

292

                 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));

294 link->sl_invalidate_ts = 0;

295 }

296 }

297

298 static __inline__ void

 wql_do_invalidate_stats(struct lt_elem *elem)

300 {

         struct waitq_link *link = (struct waitq_link *)elem;

302

303 if (!elem) {

304 return;

305 }

306

         assert(link->sl_mkvalid_ts > 0);

308

         memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));

310 link->sl_invalidate_ts = mach_absolute_time();

         waitq_grab_backtrace(link->sl_invalidate_bt, 0);

312 }

313

314 static __inline__ void

 wql_do_mkvalid_stats(struct lt_elem *elem)

316 {

         struct waitq_link *link = (struct waitq_link *)elem;

318

319 if (!elem) {

320 return;

321 }

322

         memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));

324 link->sl_mkvalid_ts = mach_absolute_time();

         waitq_grab_backtrace(link->sl_mkvalid_bt, 0);

326 }

327 #else

328 #define wql_do_alloc_stats(e)

329 #define wql_do_invalidate_stats(e)

330 #define wql_do_mkvalid_stats(e)

331 #endif /* KEEP_WAITQ_LINK_STATS */

332

333 static void

334 wql_init(void)

335 {

         uint32_t tablesz = 0, max_links = 0;

337

         if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE) {

339 tablesz = (uint32_t)g_lt_max_tbl_size;

340 }

341

         tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);

         max_links = tablesz / sizeof(struct waitq_link);

         assert(max_links > 0 && tablesz > 0);

345

346 /* we have a restricted index range */

         if (max_links > (LT_IDX_MAX + 1)) {

348 max_links = LT_IDX_MAX + 1;

349 }

350

         wqinfo("init linktable with max:%d elements (%d bytes)",

352 max_links, tablesz);

         ltable_init(&g_wqlinktable, "wqslab.wql", max_links,

             sizeof(struct waitq_link), wql_poison);

355 }

356

357 static void

358 wql_ensure_free_space(void)

359 {

         if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) {

361 /*

362 * we don't hold locks on these values, so check for underflow

363 */

                 if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) {

                         wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",

366 g_wqlinktable.nelem, g_wqlinktable.used_elem,

367 g_min_free_table_elem);

368 ltable_grow(&g_wqlinktable, g_min_free_table_elem);

369 }

370 }

371 }

372

373 static struct waitq_link *

374 wql_alloc_link(int type)

375 {

376 struct lt_elem *elem;

377

         elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0);

379 wql_do_alloc_stats(elem);

         return (struct waitq_link *)elem;

381 }

382

383 static void

 wql_realloc_link(struct waitq_link *link, int type)

385 {

         ltable_realloc_elem(&g_wqlinktable, &link->wqte, type);

387 #ifdef KEEP_WAITQ_LINK_STATS

         memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));

389 link->sl_alloc_ts = 0;

390 wql_do_alloc_stats(&link->wqte);

391

         memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));

393 link->sl_invalidate_ts = 0;

394 #endif

395 }

396

397 static void

 wql_invalidate(struct waitq_link *link)

399 {

400 lt_elem_invalidate(&link->wqte);

401 wql_do_invalidate_stats(&link->wqte);

402 }

403

404 static struct waitq_link *

405 wql_get_link(uint64_t setid)

406 {

407 struct lt_elem *elem;

408

         elem = ltable_get_elem(&g_wqlinktable, setid);

         return (struct waitq_link *)elem;

411 }

412

413 static void

 wql_put_link(struct waitq_link *link)

415 {

416 if (!link) {

417 return;

418 }

         ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link);

420 }

421

422 static struct waitq_link *

 wql_get_reserved(uint64_t setid, int type)

424 {

425 struct lt_elem *elem;

426

         elem = lt_elem_list_first(&g_wqlinktable, setid);

428 if (!elem) {

429 return NULL;

430 }

         ltable_realloc_elem(&g_wqlinktable, elem, type);

         return (struct waitq_link *)elem;

433 }

434

435

 static inline int waitq_maybe_remove_link(struct waitq *waitq,

437 uint64_t setid,

438 struct waitq_link *parent,

439 struct waitq_link *left,

440 struct waitq_link *right);

441

442 enum {

443 LINK_WALK_ONE_LEVEL = 0,

444 LINK_WALK_FULL_DAG = 1,

445 LINK_WALK_FULL_DAG_UNLOCKED = 2,

446 };

447

 typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx,

449 struct waitq_link *link);

450

451 /**

452 * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid'

453 *

454 * Conditions:

455 * waitq is locked (or NULL)

456 * 'setid' is managed by 'waitq'

457 * this could be direct (waitq->waitq_set_id == setid)

458 * OR indirect (setid is the left/right ID in a LINK chain,

459 * whose root is waitq->waitq_set_id)

460 *

461 * Notes:

462 * This function uses recursion to walk the set of table elements

463 * pointed to by 'setid'. For each element encountered, 'cb' will be

464 * called. If non-zero, the return value of this callback function can

465 * early-out of the table walk.

466 *

467 * For each link element encountered, the function takes a reference to

468 * it. The reference is dropped only after the callback and any recursion

469 * has completed.

470 *

471 * The assumed table/link/tree structure:

472 * 'setid'

473 * / \

474 * / \

475 * L(LINK) R(LINK)

476 * /\ /\

477 * / \ / \

478 * / \ Rl(*) Rr(*)

479 * Ll(*) Lr(*) /\ /\

480 * /\ /\ ... ... ... ...

481 * ... ... ... ...

482 * \

483 * WQS(wqset_q.waitq_setid == Sx)

484 * [waitq set is a membet of setid, 'Sx')

485 *

486 * 'Sx'

487 * / \

488 * / \

489 * L(LINK) R(LINK)

490 * /\ /\

491 * ... ... ... ...

492 *

493 * The basic algorithm is as follows:

494 * *) take a reference to the table object pointed to by 'setid'

495 * *) if appropriate, call 'cb' (potentially early-out on non-zero return)

496 * *) if the link object points to a waitq set, and the walk type

497 * is 'FULL_DAG' (full directed-acyclic-graph), then try to lock

498 * the associated waitq set object and recursively walk all sets to

499 * which that set belongs. This is a DFS of the tree structure.

500 * *) recurse down the left side of the tree (following the

501 * 'left_setid' pointer in the link object

502 * *) recurse down the right side of the tree (following the

503 * 'right_setid' pointer in the link object

504 */

505 static __attribute__((noinline))

506 int

 walk_waitq_links(int walk_type, struct waitq *waitq,

508 uint64_t setid, int link_type,

509 void *ctx, wql_callback_func cb)

510 {

511 struct waitq_link *link;

512 uint64_t nextid;

513 int wqltype;

514

515 link = wql_get_link(setid);

516

517 /* invalid link */

518 if (!link) {

519 return WQ_ITERATE_CONTINUE;

520 }

521

522 setid = nextid = 0;

523 wqltype = wql_type(link);

524 if (wqltype == WQL_LINK) {

525 setid = link->wql_link.left_setid;

526 nextid = link->wql_link.right_setid;

527 }

528

529 /*

530 * Make the callback only on specified link_type (or all links)

531 * Note that after the callback, the link object may be

532 * invalid. The only valid thing we can do is put our

533 * reference to it (which may put it back on the free list)

534 */

         if (link_type == WQL_ALL || link_type == wqltype) {

536 /* allow the callback to early-out */

                 int ret = cb(waitq, ctx, link);

538 if (ret != WQ_ITERATE_CONTINUE) {

539 wql_put_link(link);

540 return ret;

541 }

542 }

543

544 if (wqltype == WQL_WQS &&

545 (walk_type == LINK_WALK_FULL_DAG ||

546 walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {

547 /*

548 * Recurse down any sets to which this wait queue set was

549 * added. We do this just before we put our reference to

550 * the link object (which may free it).

551 */

                 struct waitq_set *wqset = link->wql_wqs.wql_set;

553 int ret = WQ_ITERATE_CONTINUE;

554 int should_unlock = 0;

555 uint64_t wqset_setid = 0;

556

                 if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {

                         assert(!waitq_irq_safe(&wqset->wqset_q));

559 waitq_set_lock(wqset);

560 should_unlock = 1;

561 }

562

563 /*

564 * verify the linked waitq set as it could have been

565 * invalidated before we grabbed the lock!

566 */

                 if (wqset->wqset_id != link->wql_setid.id) {

568 /* This is the bottom of the tree: just get out */

569 if (should_unlock) {

570 waitq_set_unlock(wqset);

571 }

572 wql_put_link(link);

573 return WQ_ITERATE_CONTINUE;

574 }

575

576 wqset_setid = wqset->wqset_q.waitq_set_id;

577

                 if (wqset_setid > 0) {

                         ret = walk_waitq_links(walk_type, &wqset->wqset_q,

580 wqset_setid, link_type, ctx, cb);

581 }

582 if (should_unlock) {

583 waitq_set_unlock(wqset);

584 }

585 if (ret != WQ_ITERATE_CONTINUE) {

586 wql_put_link(link);

587 return ret;

588 }

589 }

590

591 wql_put_link(link);

592

593 /* recurse down left side of the tree */

594 if (setid) {

                 int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb);

596 if (ret != WQ_ITERATE_CONTINUE) {

597 return ret;

598 }

599 }

600

601 /* recurse down right side of the tree */

602 if (nextid) {

                 return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb);

604 }

605

606 return WQ_ITERATE_CONTINUE;

607 }

608

609 /* ----------------------------------------------------------------------

610 *

611 * Prepost Link Table Implementation

612 *

613 * ---------------------------------------------------------------------- */

614 static struct link_table g_prepost_table;

615

616 enum wq_prepost_type {

617 WQP_FREE = LT_FREE,

618 WQP_WQ = LT_ELEM,

619 WQP_POST = LT_LINK,

620 };

621

622 struct wq_prepost {

623 struct lt_elem wqte;

624

625 union {

626 /* wqt_type == WQP_WQ (LT_ELEM) */

627 struct {

628 struct waitq *wqp_wq_ptr;

629 } wqp_wq;

630 /* wqt_type == WQP_POST (LT_LINK) */

631 struct {

632 uint64_t wqp_next_id;

633 uint64_t wqp_wq_id;

634 } wqp_post;

635 };

636 #ifdef KEEP_WAITQ_PREPOST_STATS

637 thread_t wqp_alloc_th;

638 task_t wqp_alloc_task;

639 uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];

640 #endif

641 };

642 #if !defined(KEEP_WAITQ_PREPOST_STATS)

 static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,

644 "wq_prepost struct must be a power of two!");

645 #endif

646

647 #define wqp_refcnt(wqp) \

648 (lt_bits_refcnt((wqp)->wqte.lt_bits))

649

650 #define wqp_type(wqp) \

651 (lt_bits_type((wqp)->wqte.lt_bits))

652

653 #define wqp_set_valid(wqp) \

654 lt_elem_mkvalid(&(wqp)->wqte)

655

656 #define wqp_is_valid(wqp) \

657 lt_bits_valid((wqp)->wqte.lt_bits)

658

659 #define wqp_prepostid wqte.lt_id

660

661 #define WQP_WQ_POISON (0x0bad0badffffffffull)

662 #define WQP_POST_POISON (0xf00df00df00df00d)

663

664 static void

 wqp_poison(struct link_table *table, struct lt_elem *elem)

666 {

         struct wq_prepost *wqp = (struct wq_prepost *)elem;

668 (void)table;

669

         switch (wqp_type(wqp)) {

671 case WQP_WQ:

672 break;

673 case WQP_POST:

674 wqp->wqp_post.wqp_next_id = WQP_POST_POISON;

675 wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;

676 break;

677 default:

678 break;

679 }

680 }

681

682 #ifdef KEEP_WAITQ_PREPOST_STATS

683 static __inline__ void

 wqp_do_alloc_stats(struct lt_elem *elem)

685 {

686 if (!elem) {

687 return;

688 }

689

         struct wq_prepost *wqp = (struct wq_prepost *)elem;

         uintptr_t alloc_bt[sizeof(wqp->wqp_alloc_bt)];

692

693 waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES);

694

695 /* be sure the take stats for _all_ allocated objects */

696 for (;;) {

                 memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt));

698 wqp->wqp_alloc_th = current_thread();

699 wqp->wqp_alloc_task = current_task();

                 wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte);

701 if (!wqp) {

702 break;

703 }

704 }

705 }

706 #else

707 #define wqp_do_alloc_stats(e)

708 #endif /* KEEP_WAITQ_LINK_STATS */

709

710 static void

711 wqp_init(void)

712 {

         uint32_t tablesz = 0, max_wqp = 0;

714

         if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE) {

716 tablesz = (uint32_t)g_lt_max_tbl_size;

717 }

718

         tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);

         max_wqp = tablesz / sizeof(struct wq_prepost);

         assert(max_wqp > 0 && tablesz > 0);

722

723 /* we have a restricted index range */

         if (max_wqp > (LT_IDX_MAX + 1)) {

725 max_wqp = LT_IDX_MAX + 1;

726 }

727

         wqinfo("init prepost table with max:%d elements (%d bytes)",

729 max_wqp, tablesz);

         ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp,

             sizeof(struct wq_prepost), wqp_poison);

732 }

733

734 /*

735 * Refill the per-CPU cache.

736 */

737 static void

738 wq_prepost_refill_cpu_cache(uint32_t nalloc)

739 {

740 struct lt_elem *new_head, *old_head;

741 struct wqp_cache *cache;

742

743 /* require preemption enabled to allocate elements */

         if (get_preemption_level() != 0) {

745 return;

746 }

747

748 new_head = ltable_alloc_elem(&g_prepost_table,

749 LT_RESERVED, nalloc, 1);

750 if (new_head == NULL) {

751 return;

752 }

753

754 disable_preemption();

755 cache = PERCPU_GET(wqp_cache);

756

757 /* check once more before putting these elements on the list */

         if (cache->avail >= WQP_CACHE_MAX) {

                 lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED);

760 enable_preemption();

761 return;

762 }

763

764 cache->avail += nalloc;

         if (cache->head == 0 || cache->head == LT_IDX_MAX) {

                 cache->head = new_head->lt_id.id;

767 goto out;

768 }

769

         old_head = lt_elem_list_first(&g_prepost_table, cache->head);

         (void)lt_elem_list_link(&g_prepost_table, new_head, old_head);

         cache->head = new_head->lt_id.id;

773

774 out:

775 enable_preemption();

776 return;

777 }

778

779 static void

780 wq_prepost_ensure_free_space(void)

781 {

782 uint32_t free_elem;

783 uint32_t min_free;

784 struct wqp_cache *cache;

785

         if (g_min_free_cache == 0) {

787 g_min_free_cache = (WQP_CACHE_MAX * ml_wait_max_cpus());

788 }

789

790 /*

791 * Ensure that we always have a pool of per-CPU prepost elements

792 */

793 disable_preemption();

794 cache = PERCPU_GET(wqp_cache);

795 free_elem = cache->avail;

796 enable_preemption();

797

         if (free_elem < (WQP_CACHE_MAX / 3)) {

799 wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);

800 }

801

802 /*

803 * Now ensure that we have a sufficient amount of free table space

804 */

         free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;

806 min_free = g_min_free_table_elem + g_min_free_cache;

807 if (free_elem < min_free) {

808 /*

809 * we don't hold locks on these values, so check for underflow

810 */

                 if (g_prepost_table.used_elem <= g_prepost_table.nelem) {

                         wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",

813 g_prepost_table.nelem, g_prepost_table.used_elem,

814 g_min_free_table_elem, g_min_free_cache);

815 ltable_grow(&g_prepost_table, min_free);

816 }

817 }

818 }

819

820 static struct wq_prepost *

 wq_prepost_alloc(int type, int nelem)

822 {

823 struct lt_elem *elem;

824 struct wq_prepost *wqp;

825 struct wqp_cache *cache;

826

827 if (type != LT_RESERVED) {

828 goto do_alloc;

829 }

         if (nelem == 0) {

831 return NULL;

832 }

833

834 /*

835 * First try to grab the elements from the per-CPU cache if we are

836 * allocating RESERVED elements

837 */

838 disable_preemption();

839 cache = PERCPU_GET(wqp_cache);

         if (nelem <= (int)cache->avail) {

                 struct lt_elem *first, *next = NULL;

842 int nalloc = nelem;

843

844 cache->avail -= nelem;

845

846 /* grab the first element */

                 first = lt_elem_list_first(&g_prepost_table, cache->head);

848

849 /* find the last element and re-adjust the cache head */

                 for (elem = first; elem != NULL && nalloc > 0; elem = next) {

                         next = lt_elem_list_next(&g_prepost_table, elem);

                         if (--nalloc == 0) {

853 /* terminate the allocated list */

854 elem->lt_next_idx = LT_IDX_MAX;

855 break;

856 }

857 }

                 assert(nalloc == 0);

859 if (!next) {

860 cache->head = LT_IDX_MAX;

861 } else {

                         cache->head = next->lt_id.id;

863 }

864 /* assert that we don't have mis-matched book keeping */

                 assert(!(cache->head == LT_IDX_MAX && cache->avail > 0));

866 enable_preemption();

867 elem = first;

868 goto out;

869 }

870 enable_preemption();

871

872 do_alloc:

873 /* fall-back to standard table allocation */

         elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0);

875 if (!elem) {

876 return NULL;

877 }

878

879 out:

880 wqp = (struct wq_prepost *)elem;

881 wqp_do_alloc_stats(elem);

882 return wqp;

883 }

884

885 static void

 wq_prepost_invalidate(struct wq_prepost *wqp)

887 {

888 lt_elem_invalidate(&wqp->wqte);

889 }

890

891 static struct wq_prepost *

892 wq_prepost_get(uint64_t wqp_id)

893 {

894 struct lt_elem *elem;

895

         elem = ltable_get_elem(&g_prepost_table, wqp_id);

         return (struct wq_prepost *)elem;

898 }

899

900 static void

 wq_prepost_put(struct wq_prepost *wqp)

902 {

         ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp);

904 }

905

906 static int

 wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)

908 {

         return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);

910 }

911

912 static struct wq_prepost *

 wq_prepost_get_rnext(struct wq_prepost *head)

914 {

915 struct lt_elem *elem;

916 struct wq_prepost *wqp;

917 uint64_t id;

918

         elem = lt_elem_list_next(&g_prepost_table, &head->wqte);

920 if (!elem) {

921 return NULL;

922 }

923 id = elem->lt_id.id;

         elem = ltable_get_elem(&g_prepost_table, id);

925

926 if (!elem) {

927 return NULL;

928 }

929 wqp = (struct wq_prepost *)elem;

         if (elem->lt_id.id != id ||

931 wqp_type(wqp) != WQP_POST ||

             wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {

933 ltable_put_elem(&g_prepost_table, elem);

934 return NULL;

935 }

936

937 return wqp;

938 }

939

940 static void

 wq_prepost_reset_rnext(struct wq_prepost *wqp)

942 {

         (void)lt_elem_list_break(&g_prepost_table, &wqp->wqte);

944 }

945

946

947 /**

948 * remove 'wqp' from the prepost list on 'wqset'

949 *

950 * Conditions:

951 * wqset is locked

952 * caller holds a reference on wqp (and is responsible to release it)

953 *

954 * Result:

955 * wqp is invalidated, wqset is potentially updated with a new

956 * prepost ID, and the next element of the prepost list may be

957 * consumed as well (if the list contained only 2 objects)

958 */

959 static int

 wq_prepost_remove(struct waitq_set *wqset,

961 struct wq_prepost *wqp)

962 {

963 int more_posts = 1;

         uint64_t next_id = wqp->wqp_post.wqp_next_id;

         uint64_t wqp_id = wqp->wqp_prepostid.id;

966 struct wq_prepost *prev_wqp, *next_wqp;

967

         assert(wqp_type(wqp) == WQP_POST);

         assert(wqset->wqset_q.waitq_prepost == 1);

970

971 if (next_id == wqp_id) {

972 /* the list is singular and becoming empty */

973 wqset->wqset_prepost_id = 0;

974 more_posts = 0;

975 goto out;

976 }

977

978 prev_wqp = wq_prepost_get_rnext(wqp);

979 assert(prev_wqp != NULL);

         assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);

         assert(prev_wqp->wqp_prepostid.id != wqp_id);

         assert(wqp_type(prev_wqp) == WQP_POST);

983

         if (prev_wqp->wqp_prepostid.id == next_id) {

985 /*

986 * There are two items in the list, and we're removing one. We

987 * only need to keep the WQP_WQ pointer from 'prev_wqp'

988 */

                 wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;

990 wq_prepost_invalidate(prev_wqp);

991 wq_prepost_put(prev_wqp);

992 more_posts = 0;

993 goto out;

994 }

995

996 /* prev->next = next */

997 prev_wqp->wqp_post.wqp_next_id = next_id;

998

999 /* next->prev = prev */

1000 next_wqp = wq_prepost_get(next_id);

1001 assert(next_wqp != NULL);

1002 assert(next_wqp != wqp);

1003 assert(next_wqp != prev_wqp);

         assert(wqp_type(next_wqp) == WQP_POST);

1005

1006 wq_prepost_reset_rnext(next_wqp);

1007 wq_prepost_rlink(next_wqp, prev_wqp);

1008

1009 /* If we remove the head of the list, update the wqset */

         if (wqp_id == wqset->wqset_prepost_id) {

1011 wqset->wqset_prepost_id = next_id;

1012 }

1013

1014 wq_prepost_put(prev_wqp);

1015 wq_prepost_put(next_wqp);

1016

1017 out:

1018 wq_prepost_reset_rnext(wqp);

1019 wq_prepost_invalidate(wqp);

1020 return more_posts;

1021 }

1022

1023 static struct wq_prepost *

1024 wq_prepost_rfirst(uint64_t id)

1025 {

1026 struct lt_elem *elem;

         elem = lt_elem_list_first(&g_prepost_table, id);

1028 wqp_do_alloc_stats(elem);

         return (struct wq_prepost *)(void *)elem;

1030 }

1031

1032 static struct wq_prepost *

 wq_prepost_rpop(uint64_t *id, int type)

1034 {

1035 struct lt_elem *elem;

         elem = lt_elem_list_pop(&g_prepost_table, id, type);

1037 wqp_do_alloc_stats(elem);

         return (struct wq_prepost *)(void *)elem;

1039 }

1040

1041 static void

 wq_prepost_release_rlist(struct wq_prepost *wqp)

1043 {

1044 int nelem = 0;

1045 struct wqp_cache *cache;

1046 struct lt_elem *elem;

1047

1048 if (!wqp) {

1049 return;

1050 }

1051

1052 elem = &wqp->wqte;

1053

1054 /*

1055 * These are reserved elements: release them back to the per-cpu pool

1056 * if our cache is running low.

1057 */

1058 disable_preemption();

1059 cache = PERCPU_GET(wqp_cache);

         if (cache->avail < WQP_CACHE_MAX) {

1061 struct lt_elem *tmp = NULL;

                 if (cache->head != LT_IDX_MAX) {

                         tmp = lt_elem_list_first(&g_prepost_table, cache->head);

1064 }

                 nelem = lt_elem_list_link(&g_prepost_table, elem, tmp);

                 cache->head = elem->lt_id.id;

1067 cache->avail += nelem;

1068 enable_preemption();

1069 return;

1070 }

1071 enable_preemption();

1072

1073 /* release these elements back to the main table */

         nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED);

1075

1076 #if CONFIG_WAITQ_STATS

1077 g_prepost_table.nreserved_releases += 1;

1078 OSDecrementAtomic64(&g_prepost_table.nreservations);

1079 #endif

1080 }

1081

 typedef int (*wqp_callback_func)(struct waitq_set *wqset,

1083 void *ctx,

1084 struct wq_prepost *wqp,

1085 struct waitq *waitq);

1086

1087 /**

1088 * iterate over a chain of preposts associated with a waitq set.

1089 *

1090 * Conditions:

1091 * wqset is locked

1092 *

1093 * Notes:

1094 * This loop performs automatic prepost chain management / culling, and

1095 * may reset or adjust the waitq set's prepost ID pointer. If you don't

1096 * want this extra processing, you can use wq_prepost_iterate().

1097 */

1098 static int

 wq_prepost_foreach_locked(struct waitq_set *wqset,

1100 void *ctx, wqp_callback_func cb)

1101 {

1102 int ret = WQ_ITERATE_SUCCESS;

1103 struct wq_prepost *wqp, *tmp_wqp;

1104

1105 assert(cb != NULL);

1106

         if (!wqset || !waitq_set_maybe_preposted(wqset)) {

1108 return WQ_ITERATE_SUCCESS;

1109 }

1110

1111 restart:

         wqp = wq_prepost_get(wqset->wqset_prepost_id);

1113 if (!wqp) {

1114 /*

1115 * The prepost object is no longer valid, reset the waitq

1116 * set's prepost id.

1117 */

1118 wqset->wqset_prepost_id = 0;

1119 return WQ_ITERATE_SUCCESS;

1120 }

1121

         if (wqp_type(wqp) == WQP_WQ) {

                 uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;

1124

                 ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);

1126

1127 switch (ret) {

1128 case WQ_ITERATE_INVALIDATE_CONTINUE:

1129 /* the caller wants to remove the only prepost here */

                         assert(wqp_id == wqset->wqset_prepost_id);

1131 wqset->wqset_prepost_id = 0;

1132 OS_FALLTHROUGH;

1133 case WQ_ITERATE_CONTINUE:

1134 wq_prepost_put(wqp);

1135 ret = WQ_ITERATE_SUCCESS;

1136 break;

1137 case WQ_ITERATE_RESTART:

1138 wq_prepost_put(wqp);

1139 OS_FALLTHROUGH;

1140 case WQ_ITERATE_DROPPED:

1141 goto restart;

1142 default:

1143 wq_prepost_put(wqp);

1144 break;

1145 }

1146 return ret;

1147 }

1148

         assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);

         assert(wqp_type(wqp) == WQP_POST);

1151

1152 /*

1153 * At this point we know we have a list of POST objects.

1154 * Grab a handle to the last element in the list and start

1155 * the iteration.

1156 */

1157 tmp_wqp = wq_prepost_get_rnext(wqp);

         assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);

1159

         uint64_t last_id = tmp_wqp->wqp_prepostid.id;

1161 wq_prepost_put(tmp_wqp);

1162

1163 ret = WQ_ITERATE_SUCCESS;

1164 for (;;) {

1165 uint64_t wqp_id, first_id, next_id;

1166

1167 wqp_id = wqp->wqp_prepostid.id;

1168 first_id = wqset->wqset_prepost_id;

1169 next_id = wqp->wqp_post.wqp_next_id;

1170

1171 /* grab the WQP_WQ object this _POST points to */

                 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);

1173 if (!tmp_wqp) {

1174 /*

1175 * This WQP_POST object points to an invalid

1176 * WQP_WQ object - remove the POST object from

1177 * the list.

1178 */

                         if (wq_prepost_remove(wqset, wqp) == 0) {

1180 wq_prepost_put(wqp);

1181 goto restart;

1182 }

1183 goto next_prepost;

1184 }

                 assert(wqp_type(tmp_wqp) == WQP_WQ);

1186 /*

1187 * make the callback: note that this could remove 'wqp' or

1188 * drop the lock on our waitq set. We need to re-validate

1189 * our state when this function returns.

1190 */

                 ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr);

1192 wq_prepost_put(tmp_wqp);

1193

1194 switch (ret) {

1195 case WQ_ITERATE_CONTINUE:

1196 /* continue iteration */

1197 break;

1198 case WQ_ITERATE_INVALIDATE_CONTINUE:

                         assert(next_id == wqp->wqp_post.wqp_next_id);

                         if (wq_prepost_remove(wqset, wqp) == 0) {

1201 wq_prepost_put(wqp);

1202 goto restart;

1203 }

1204 goto next_prepost;

1205 case WQ_ITERATE_RESTART:

1206 wq_prepost_put(wqp);

1207 OS_FALLTHROUGH;

1208 case WQ_ITERATE_DROPPED:

1209 /* the callback dropped the ref to wqp: just restart */

1210 goto restart;

1211 default:

1212 /* break out of the iteration for some other reason */

1213 goto finish_prepost_foreach;

1214 }

1215

1216 /*

1217 * the set lock may have been dropped during callback,

1218 * if something looks different, restart the prepost iteration

1219 */

                 if (!wqp_is_valid(wqp) ||

                     (wqp->wqp_post.wqp_next_id != next_id) ||

1222 wqset->wqset_prepost_id != first_id) {

1223 wq_prepost_put(wqp);

1224 goto restart;

1225 }

1226

1227 next_prepost:

1228 /* this was the last object in the list */

1229 if (wqp_id == last_id) {

1230 break;

1231 }

1232

1233 /* get the next object */

1234 tmp_wqp = wq_prepost_get(next_id);

1235 if (!tmp_wqp) {

1236 /*

1237 * At this point we've already checked our state

1238 * after the callback (which may have dropped the set

1239 * lock). If we find an invalid member of the list

1240 * then something is wrong.

1241 */

                         panic("Invalid WQP_POST member 0x%llx in waitq set "

                             "0x%llx prepost list (first:%llx, "

1244 "wqp:%p)",

                             next_id, wqset->wqset_id, first_id, wqp);

1246 }

1247 wq_prepost_put(wqp);

1248 wqp = tmp_wqp;

1249

                 assert(wqp_type(wqp) == WQP_POST);

1251 }

1252

1253 finish_prepost_foreach:

1254 wq_prepost_put(wqp);

1255 if (ret == WQ_ITERATE_CONTINUE) {

1256 ret = WQ_ITERATE_SUCCESS;

1257 }

1258

1259 return ret;

1260 }

1261

1262 /**

1263 * Perform a simple loop over a chain of prepost objects

1264 *

1265 * Conditions:

1266 * If 'prepost_id' is associated with a waitq (set) then that object must

1267 * be locked before calling this function.

1268 * Callback function, 'cb', must be able to handle a NULL wqset pointer

1269 * and a NULL waitq pointer!

1270 *

1271 * Notes:

1272 * This prepost chain iteration will _not_ automatically adjust any chain

1273 * element or linkage. This is the responsibility of the caller! If you

1274 * want automatic prepost chain management (at a cost of extra CPU time),

1275 * you can use: wq_prepost_foreach_locked().

1276 */

1277 static int

1278 wq_prepost_iterate(uint64_t prepost_id,

1279 void *ctx, wqp_callback_func cb)

1280 {

1281 int ret;

1282 struct wq_prepost *wqp;

1283

1284 if (!prepost_id) {

1285 return WQ_ITERATE_SUCCESS;

1286 }

1287

1288 wqp = wq_prepost_get(prepost_id);

1289 if (!wqp) {

1290 return WQ_ITERATE_SUCCESS;

1291 }

1292

         if (wqp_type(wqp) == WQP_WQ) {

1294 ret = WQ_ITERATE_SUCCESS;

1295 if (cb) {

                         ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);

1297 }

1298

1299 if (ret != WQ_ITERATE_DROPPED) {

1300 wq_prepost_put(wqp);

1301 }

1302 return ret;

1303 }

1304

         assert(wqp->wqp_prepostid.id == prepost_id);

         assert(wqp_type(wqp) == WQP_POST);

1307

1308 /* at this point we know we have a list of POST objects */

1309 uint64_t next_id;

1310

1311 ret = WQ_ITERATE_CONTINUE;

1312 do {

1313 struct wq_prepost *tmp_wqp;

1314 struct waitq *wq = NULL;

1315

1316 next_id = wqp->wqp_post.wqp_next_id;

1317

1318 /* grab the WQP_WQ object this _POST points to */

                 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);

1320 if (tmp_wqp) {

                         assert(wqp_type(tmp_wqp) == WQP_WQ);

1322 wq = tmp_wqp->wqp_wq.wqp_wq_ptr;

1323 }

1324

1325 if (cb) {

                         ret = cb(NULL, ctx, wqp, wq);

1327 }

1328 if (tmp_wqp) {

1329 wq_prepost_put(tmp_wqp);

1330 }

1331

1332 if (ret != WQ_ITERATE_CONTINUE) {

1333 break;

1334 }

1335

1336 tmp_wqp = wq_prepost_get(next_id);

1337 if (!tmp_wqp) {

1338 /*

1339 * the chain is broken: nothing we can do here besides

1340 * bail from the iteration.

1341 */

1342 ret = WQ_ITERATE_ABORTED;

1343 break;

1344 }

1345

1346 wq_prepost_put(wqp);

1347 wqp = tmp_wqp;

1348

                 assert(wqp_type(wqp) == WQP_POST);

         } while (next_id != prepost_id);

1351

1352 if (ret != WQ_ITERATE_DROPPED) {

1353 wq_prepost_put(wqp);

1354 }

1355

1356 if (ret == WQ_ITERATE_CONTINUE) {

1357 ret = WQ_ITERATE_SUCCESS;

1358 }

1359 return ret;

1360 }

1361

1362

1363 struct _is_posted_ctx {

1364 struct waitq *posting_wq;

1365 int did_prepost;

1366 };

1367

1368 static int

 wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,

     struct wq_prepost *wqp, struct waitq *waitq)

1371 {

         struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;

1373

1374 (void)wqset;

1375 (void)wqp;

1376

1377 /*

1378 * Don't early-out, run through the _entire_ list:

1379 * This ensures that we retain a minimum number of invalid elements.

1380 */

         if (pctx->posting_wq == waitq) {

1382 pctx->did_prepost = 1;

1383 }

1384

1385 return WQ_ITERATE_CONTINUE;

1386 }

1387

1388

1389 /**

1390 * checks if 'waitq' has already preposted on 'wqset'

1391 *

1392 * Parameters:

1393 * waitq The waitq that's preposting

1394 * wqset The set onto which waitq may be preposted

1395 *

1396 * Conditions:

1397 * both waitq and wqset are locked

1398 *

1399 * Returns non-zero if 'waitq' has already preposted to 'wqset'

1400 */

1401 static int

 wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)

1403 {

1404 int ret;

1405 struct _is_posted_ctx pctx;

1406

1407 /*

1408 * If the set's only prepost matches the waitq's prepost ID,

1409 * then it obviously already preposted to the set.

1410 */

         if (waitq->waitq_prepost_id != 0 &&

1412 wqset->wqset_prepost_id == waitq->waitq_prepost_id) {

1413 return 1;

1414 }

1415

1416 /* use full prepost iteration: always trim the list */

1417 pctx.posting_wq = waitq;

1418 pctx.did_prepost = 0;

         ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,

1420 wq_is_preposted_on_set_cb);

1421 return pctx.did_prepost;

1422 }

1423

1424 static struct wq_prepost *

 wq_get_prepost_obj(uint64_t *reserved, int type)

1426 {

1427 struct wq_prepost *wqp = NULL;

1428 /*

1429 * don't fail just because the caller doesn't have enough

1430 * reservations, we've kept a low-water mark on the prepost table,

1431 * so there should be some available for us.

1432 */

1433 if (reserved && *reserved) {

                 wqp = wq_prepost_rpop(reserved, type);

                 assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem);

1436 } else {

1437 /*

1438 * TODO: if in interrupt context, grab from a special

1439 * region / reserved list!

1440 */

                 wqp = wq_prepost_alloc(type, 1);

1442 }

1443

1444 if (wqp == NULL) {

1445 panic("Couldn't allocate prepost object!");

1446 }

1447 return wqp;

1448 }

1449

1450

1451 /**

1452 * prepost a waitq onto a waitq set

1453 *

1454 * Parameters:

1455 * wqset The set onto which waitq will be preposted

1456 * waitq The waitq that's preposting

1457 * reserved List (lt_elem_list_ style) of pre-allocated prepost elements

1458 * Could be NULL

1459 *

1460 * Conditions:

1461 * both wqset and waitq are locked

1462 *

1463 * Notes:

1464 * If reserved is NULL, this may block on prepost table growth.

1465 */

1466 static void

 wq_prepost_do_post_locked(struct waitq_set *wqset,

1468 struct waitq *waitq,

1469 uint64_t *reserved)

1470 {

         struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;

1472

         assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));

1474

1475 /*

1476 * nothing to do if it's already preposted:

1477 * note that this also culls any invalid prepost objects

1478 */

         if (wq_is_preposted_on_set(waitq, wqset)) {

1480 return;

1481 }

1482

         assert(waitqs_is_linked(wqset));

1484

1485 /*

1486 * This function is called because an event is being posted to 'waitq'.

1487 * We need a prepost object associated with this queue. Allocate one

1488 * now if the waitq isn't already associated with one.

1489 */

         if (waitq->waitq_prepost_id == 0) {

1491 struct wq_prepost *wqp;

                 wqp = wq_get_prepost_obj(reserved, WQP_WQ);

1493 wqp->wqp_wq.wqp_wq_ptr = waitq;

1494 wqp_set_valid(wqp);

                 waitq->waitq_prepost_id = wqp->wqp_prepostid.id;

1496 wq_prepost_put(wqp);

1497 }

1498

1499 #if CONFIG_LTABLE_STATS

1500 g_prepost_table.npreposts += 1;

1501 #endif

1502

         wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",

             (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),

1505 waitq->waitq_prepost_id, wqset->wqset_id);

1506

         if (wqset->wqset_prepost_id == 0) {

1508 /* the set has no previous preposts */

1509 wqset->wqset_prepost_id = waitq->waitq_prepost_id;

1510 return;

1511 }

1512

         wqp_head = wq_prepost_get(wqset->wqset_prepost_id);

1514 if (!wqp_head) {

1515 /* the previous prepost has become invalid */

1516 wqset->wqset_prepost_id = waitq->waitq_prepost_id;

1517 return;

1518 }

1519

         assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);

1521

1522 /*

1523 * If we get here, we're going to need at least one new wq_prepost

1524 * object. If the previous wqset_prepost_id points to a WQP_WQ, we

1525 * actually need to allocate 2 wq_prepost objects because the WQP_WQ

1526 * is tied to the waitq and shared across all sets.

1527 */

         wqp_post = wq_get_prepost_obj(reserved, WQP_POST);

1529

         wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;

         wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,

1532 waitq->waitq_prepost_id);

1533

         if (wqp_type(wqp_head) == WQP_WQ) {

1535 /*

1536 * We must replace the wqset_prepost_id with a pointer

1537 * to two new WQP_POST objects

1538 */

                 uint64_t wqp_id = wqp_head->wqp_prepostid.id;

                 wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "

1541 "replacing with two POST preposts",

1542 wqset->wqset_id, wqp_id);

1543

1544 /* drop the old reference */

1545 wq_prepost_put(wqp_head);

1546

1547 /* grab another new object (the 2nd of two) */

                 wqp_head = wq_get_prepost_obj(reserved, WQP_POST);

1549

1550 /* point this one to the original WQP_WQ object */

1551 wqp_head->wqp_post.wqp_wq_id = wqp_id;

                 wqdbg_v("POST 0x%llx :: WQ 0x%llx",

1553 wqp_head->wqp_prepostid.id, wqp_id);

1554

1555 /* link it to the new wqp_post object allocated earlier */

                 wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;

1557 /* make the list a double-linked and circular */

1558 wq_prepost_rlink(wqp_head, wqp_post);

1559

1560 /*

1561 * Finish setting up the new prepost: point it back to the

1562 * POST object we allocated to replace the original wqset

1563 * WQ prepost object

1564 */

                 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;

1566 wq_prepost_rlink(wqp_post, wqp_head);

1567

1568 /* mark objects valid, and reset the wqset prepost list head */

1569 wqp_set_valid(wqp_head);

1570 wqp_set_valid(wqp_post);

                 wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;

1572

1573 /* release both references */

1574 wq_prepost_put(wqp_head);

1575 wq_prepost_put(wqp_post);

1576

                 wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",

1578 wqset->wqset_id, wqset->wqset_prepost_id,

                     wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,

1580 wqp_post->wqp_prepostid.id,

1581 wqp_post->wqp_post.wqp_next_id);

1582 return;

1583 }

1584

         assert(wqp_type(wqp_head) == WQP_POST);

1586

1587 /*

1588 * Add the new prepost to the end of the prepost list

1589 */

1590 wqp_tail = wq_prepost_get_rnext(wqp_head);

1591 assert(wqp_tail != NULL);

         assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);

1593

1594 /*

1595 * link the head to the new tail

1596 * NOTE: this needs to happen first in case wqp_tail == wqp_head

1597 */

1598 wq_prepost_reset_rnext(wqp_head);

1599 wq_prepost_rlink(wqp_head, wqp_post);

1600

1601 /* point the new object to the list head, and list tail */

         wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;

1603 wq_prepost_rlink(wqp_post, wqp_tail);

1604

1605 /* point the last item in the waitq set's list to the new object */

         wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;

1607

1608 wqp_set_valid(wqp_post);

1609

1610 wq_prepost_put(wqp_head);

1611 wq_prepost_put(wqp_tail);

1612 wq_prepost_put(wqp_post);

1613

         wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "

             "new_prepost:0x%llx->0x%llx", wqset->wqset_id,

             wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,

             wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);

1618

1619 return;

1620 }

1621

1622

1623 /* ----------------------------------------------------------------------

1624 *

1625 * Stats collection / reporting

1626 *

1627 * ---------------------------------------------------------------------- */

1628 #if CONFIG_LTABLE_STATS && CONFIG_WAITQ_STATS

1629 static void

 wq_table_stats(struct link_table *table, struct wq_table_stats *stats)

1631 {

1632 stats->version = WAITQ_STATS_VERSION;

1633 stats->table_elements = table->nelem;

1634 stats->table_used_elems = table->used_elem;

1635 stats->table_elem_sz = table->elem_sz;

1636 stats->table_slabs = table->nslabs;

1637 stats->table_slab_sz = table->slab_sz;

1638

1639 stats->table_num_allocs = table->nallocs;

1640 stats->table_num_preposts = table->npreposts;

1641 stats->table_num_reservations = table->nreservations;

1642

1643 stats->table_max_used = table->max_used;

1644 stats->table_avg_used = table->avg_used;

1645 stats->table_max_reservations = table->max_reservations;

1646 stats->table_avg_reservations = table->avg_reservations;

1647 }

1648

1649 void

 waitq_link_stats(struct wq_table_stats *stats)

1651 {

1652 if (!stats) {

1653 return;

1654 }

1655 wq_table_stats(&g_wqlinktable, stats);

1656 }

1657

1658 void

 waitq_prepost_stats(struct wq_table_stats *stats)

1660 {

1661 wq_table_stats(&g_prepost_table, stats);

1662 }

1663 #endif

1664

1665

1666 /* ----------------------------------------------------------------------

1667 *

1668 * Global Wait Queues

1669 *

1670 * ---------------------------------------------------------------------- */

1671

1672 static struct waitq g_boot_waitq;

1673 static struct waitq *global_waitqs = &g_boot_waitq;

1674 static uint32_t g_num_waitqs = 1;

1675

1676 /*

1677 * Zero out the used MSBs of the event.

1678 */

1679 #define _CAST_TO_EVENT_MASK(event) ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))

1680

1681 static __inline__ uint32_t

 waitq_hash(char *key, size_t length)

1683 {

         uint32_t hash = os_hash_jenkins(key, length);

1685

1686 hash &= (g_num_waitqs - 1);

1687 return hash;

1688 }

1689

1690 /* return a global waitq pointer corresponding to the given event */

1691 struct waitq *

 _global_eventq(char *event, size_t event_length)

1693 {

         return &global_waitqs[waitq_hash(event, event_length)];

1695 }

1696

1697 /* return an indexed global waitq pointer */

1698 struct waitq *

1699 global_waitq(int index)

1700 {

         return &global_waitqs[index % g_num_waitqs];

1702 }

1703

1704

1705 #if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS

1706 /* this global is for lldb */

1707 const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;

1708

1709 static __inline__ void

 waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)

1711 {

1712 uintptr_t buf[NWAITQ_BTFRAMES + skip];

         if (skip < 0) {

1714 skip = 0;

1715 }

         memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));

         backtrace(buf, g_nwaitq_btframes + skip, NULL);

         memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));

1719 }

1720 #else /* no stats */

1721 #define waitq_grab_backtrace(...)

1722 #endif

1723

1724 #if CONFIG_WAITQ_STATS

1725

1726 struct wq_stats g_boot_stats;

1727 struct wq_stats *g_waitq_stats = &g_boot_stats;

1728

1729 static __inline__ struct wq_stats *

 waitq_global_stats(struct waitq *waitq)

1731 {

1732 struct wq_stats *wqs;

1733 uint32_t idx;

1734

         if (!waitq_is_global(waitq)) {

1736 return NULL;

1737 }

1738

         idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));

1740 assert(idx < g_num_waitqs);

1741 wqs = &g_waitq_stats[idx];

1742 return wqs;

1743 }

1744

1745 static __inline__ void

 waitq_stats_count_wait(struct waitq *waitq)

1747 {

         struct wq_stats *wqs = waitq_global_stats(waitq);

1749 if (wqs != NULL) {

1750 wqs->waits++;

                 waitq_grab_backtrace(wqs->last_wait, 2);

1752 }

1753 }

1754

1755 static __inline__ void

 waitq_stats_count_wakeup(struct waitq *waitq)

1757 {

         struct wq_stats *wqs = waitq_global_stats(waitq);

1759 if (wqs != NULL) {

1760 wqs->wakeups++;

                 waitq_grab_backtrace(wqs->last_wakeup, 2);

1762 }

1763 }

1764

1765 static __inline__ void

 waitq_stats_count_clear_wakeup(struct waitq *waitq)

1767 {

         struct wq_stats *wqs = waitq_global_stats(waitq);

1769 if (wqs != NULL) {

1770 wqs->wakeups++;

1771 wqs->clears++;

                 waitq_grab_backtrace(wqs->last_wakeup, 2);

1773 }

1774 }

1775

1776 static __inline__ void

 waitq_stats_count_fail(struct waitq *waitq)

1778 {

         struct wq_stats *wqs = waitq_global_stats(waitq);

1780 if (wqs != NULL) {

1781 wqs->failed_wakeups++;

                 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);

1783 }

1784 }

1785 #else /* !CONFIG_WAITQ_STATS */

1786 #define waitq_stats_count_wait(q) do { } while (0)

1787 #define waitq_stats_count_wakeup(q) do { } while (0)

1788 #define waitq_stats_count_clear_wakeup(q) do { } while (0)

1789 #define waitq_stats_count_fail(q) do { } while (0)

1790 #endif

1791

1792 int

 waitq_is_valid(struct waitq *waitq)

1794 {

         return (waitq != NULL) && waitq->waitq_isvalid;

1796 }

1797

1798 int

 waitq_set_is_valid(struct waitq_set *wqset)

1800 {

         return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset);

1802 }

1803

1804 int

 waitq_is_global(struct waitq *waitq)

1806 {

         if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs) {

1808 return 1;

1809 }

1810 return 0;

1811 }

1812

1813 int

 waitq_irq_safe(struct waitq *waitq)

1815 {

1816 /* global wait queues have this bit set on initialization */

1817 return waitq->waitq_irq;

1818 }

1819

1820 static inline bool

 waitq_empty(struct waitq *wq)

1822 {

         if (waitq_is_turnstile_queue(wq)) {

                 return priority_queue_empty(&wq->waitq_prio_queue);

         } else if (waitq_is_turnstile_proxy(wq)) {

                 struct turnstile *ts = wq->waitq_ts;

1827 return ts == TURNSTILE_NULL ||

                        priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);

1829 } else {

                 return queue_empty(&wq->waitq_queue);

1831 }

1832 }

1833

1834 static struct waitq *

 waitq_get_safeq(struct waitq *waitq)

1836 {

1837 /* Check if it's a port waitq */

         if (waitq_is_turnstile_proxy(waitq)) {

                 struct turnstile *ts = waitq->waitq_ts;

                 return ts ? &ts->ts_waitq : NULL;

1841 }

1842 return global_eventq(waitq);

1843 }

1844

1845 static uint32_t

1846 waitq_hash_size(void)

1847 {

1848 uint32_t hsize, queues;

1849

         if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize))) {

1851 return hsize;

1852 }

1853

1854 queues = thread_max / 5;

         hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);

1856

1857 return hsize;

1858 }

1859

1860 /*

1861 * Since the priority ordered waitq uses basepri as the

1862 * ordering key assert that this value fits in a uint8_t.

1863 */

1864 static_assert(MAXPRI <= UINT8_MAX);

1865

1866 static inline void

 waitq_thread_insert(struct waitq *wq,

1868 thread_t thread, boolean_t fifo)

1869 {

         if (waitq_is_turnstile_queue(wq)) {

                 turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);

1872 turnstile_waitq_add_thread_priority_queue(wq, thread);

1873 } else {

                 turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);

1875 if (fifo) {

                         enqueue_tail(&wq->waitq_queue, &thread->wait_links);

1877 } else {

                         enqueue_head(&wq->waitq_queue, &thread->wait_links);

1879 }

1880 }

1881 }

1882

1883 static inline void

 waitq_thread_remove(struct waitq *wq,

1885 thread_t thread)

1886 {

         if (waitq_is_turnstile_queue(wq)) {

1888 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,

                     (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,

                     VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),

1891 thread_tid(thread),

                     0, 0, 0);

                 priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links);

1894 } else {

1895 remqueue(&(thread->wait_links));

1896 }

1897 }

1898

1899 void

1900 waitq_bootstrap(void)

1901 {

1902 kern_return_t kret;

1903 uint32_t whsize, qsz, tmp32;

1904

1905 g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;

         if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE) {

1907 g_min_free_table_elem = tmp32;

1908 }

         wqdbg("Minimum free table elements: %d", tmp32);

1910

1911 /*

1912 * Determine the amount of memory we're willing to reserve for

1913 * the waitqueue hash table

1914 */

1915 whsize = waitq_hash_size();

1916

1917 /* Determine the number of waitqueues we can fit. */

         qsz = sizeof(struct waitq);

         whsize = ROUNDDOWN(whsize, qsz);

1920 g_num_waitqs = whsize / qsz;

1921

1922 /*

1923 * The hash algorithm requires that this be a power of 2, so we

1924 * just mask off all the low-order bits.

1925 */

         for (uint32_t i = 0; i < 31; i++) {

                 uint32_t bit = (1 << i);

                 if ((g_num_waitqs & bit) == g_num_waitqs) {

1929 break;

1930 }

1931 g_num_waitqs &= ~bit;

1932 }

         assert(g_num_waitqs > 0);

1934

1935 /* Now determine how much memory we really need. */

         whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);

1937

         wqdbg("allocating %d global queues  (%d bytes)", g_num_waitqs, whsize);

         kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,

             whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);

         if (kret != KERN_SUCCESS || global_waitqs == NULL) {

1942 panic("kernel_memory_allocate() failed to alloc global_waitqs"

                     ", error: %d, whsize: 0x%x", kret, whsize);

1944 }

1945

1946 #if CONFIG_WAITQ_STATS

         whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);

         kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,

             whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);

         if (kret != KERN_SUCCESS || global_waitqs == NULL) {

1951 panic("kernel_memory_allocate() failed to alloc g_waitq_stats"

                     ", error: %d, whsize: 0x%x", kret, whsize);

1953 }

         memset(g_waitq_stats, 0, whsize);

1955 #endif

1956

         for (uint32_t i = 0; i < g_num_waitqs; i++) {

                 waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);

1959 }

1960

1961 /* initialize the global waitq link table */

1962 wql_init();

1963

1964 /* initialize the global waitq prepost table */

1965 wqp_init();

1966 }

1967

1968

1969 /* ----------------------------------------------------------------------

1970 *

1971 * Wait Queue Implementation

1972 *

1973 * ---------------------------------------------------------------------- */

1974

1975 /*

1976 * Double the standard lock timeout, because wait queues tend

1977 * to iterate over a number of threads - locking each. If there is

1978 * a problem with a thread lock, it normally times out at the wait

1979 * queue level first, hiding the real problem.

1980 */

1981 /* For x86, the hardware timeout is in TSC units. */

1982 #if defined(__i386__) || defined(__x86_64__)

1983 #define hwLockTimeOut LockTimeOutTSC

1984 #else

1985 #define hwLockTimeOut LockTimeOut

1986 #endif

1987

1988 void

 waitq_lock(struct waitq *wq)

1990 {

         if (__improbable(waitq_lock_to(wq,

             hwLockTimeOut * 2) == 0)) {

1993 boolean_t wql_acquired = FALSE;

1994

1995 while (machine_timeout_suspended()) {

1996 mp_enable_preemption();

1997 wql_acquired = waitq_lock_to(wq,

1998 hwLockTimeOut * 2);

1999 if (wql_acquired) {

2000 break;

2001 }

2002 }

2003 if (wql_acquired == FALSE) {

                         panic("waitq deadlock - waitq=%p, cpu=%d\n",

2005 wq, cpu_number());

2006 }

2007 }

2008 #if defined(__x86_64__)

2009 pltrace(FALSE);

2010 #endif

         assert(waitq_held(wq));

2012 }

2013

2014 void

 waitq_unlock(struct waitq *wq)

2016 {

         assert(waitq_held(wq));

2018 #if defined(__x86_64__)

2019 pltrace(TRUE);

2020 #endif

2021 waitq_lock_unlock(wq);

2022 }

2023

2024

2025 /**

2026 * clear the thread-related waitq state

2027 *

2028 * Conditions:

2029 * 'thread' is locked

2030 */

2031 static inline void

2032 thread_clear_waitq_state(thread_t thread)

2033 {

2034 thread->waitq = NULL;

2035 thread->wait_event = NO_EVENT64;

2036 thread->at_safe_point = FALSE;

2037 }

2038

2039

 typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,

2041 int is_global, thread_t thread);

2042

2043 struct waitq_select_args {

2044 /* input parameters */

2045 struct waitq *posted_waitq;

2046 struct waitq *waitq;

2047 event64_t event;

2048 waitq_select_cb select_cb;

2049 void *select_ctx;

2050 int priority;

2051

2052 uint64_t *reserved_preposts;

2053

2054 /* output parameters */

2055 queue_t threadq;

2056 int max_threads;

2057 int *nthreads;

2058 spl_t *spl;

2059 };

2060

 static void do_waitq_select_n_locked(struct waitq_select_args *args);

2062

2063 /**

2064 * callback invoked once for every waitq set to which a waitq belongs

2065 *

2066 * Conditions:

2067 * ctx->posted_waitq is locked

2068 * 'link' points to a valid waitq set

2069 *

2070 * Notes:

2071 * Takes the waitq set lock on the set pointed to by 'link'

2072 * Calls do_waitq_select_n_locked() which could recurse back into

2073 * this function if the waitq set is a member of other sets.

2074 * If no threads were selected, it preposts the input waitq

2075 * onto the waitq set pointed to by 'link'.

2076 */

2077 static int

 waitq_select_walk_cb(struct waitq *waitq, void *ctx,

2079 struct waitq_link *link)

2080 {

2081 int ret = WQ_ITERATE_CONTINUE;

         struct waitq_select_args args = *((struct waitq_select_args *)ctx);

2083 struct waitq_set *wqset;

2084

2085 (void)waitq;

         assert(wql_type(link) == WQL_WQS);

2087

2088 wqset = link->wql_wqs.wql_set;

2089 args.waitq = &wqset->wqset_q;

2090

         assert(!waitq_irq_safe(waitq));

         assert(!waitq_irq_safe(&wqset->wqset_q));

2093

2094 waitq_set_lock(wqset);

2095 /*

2096 * verify that the link wasn't invalidated just before

2097 * we were able to take the lock.

2098 */

         if (wqset->wqset_id != link->wql_setid.id) {

2100 goto out_unlock;

2101 }

2102

         assert(waitqs_is_linked(wqset));

2104

2105 /*

2106 * Find any threads waiting on this wait queue set,

2107 * and recurse into any waitq set to which this set belongs.

2108 */

2109 do_waitq_select_n_locked(&args);

2110

         if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {

2112 /* at least 1 thread was selected and returned: don't prepost */

                 if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {

2114 /* break out of the setid walk */

2115 ret = WQ_ITERATE_FOUND;

2116 }

         } else if (args.event == NO_EVENT64) {

2118 /*

2119 * No thread selected: prepost 'waitq' to 'wqset'

2120 * if wqset can handle preposts and the event is set to 0.

2121 * We also make sure to not post waitq sets to other sets.

2122 *

2123 * If the set doesn't support preposts, but does support

2124 * prepost callout/hook interaction, invoke the predefined

2125 * callout function and pass the set's 'prepost_hook.' This

2126 * could potentially release another thread to handle events.

2127 */

                 if (waitq_set_can_prepost(wqset)) {

2129 wq_prepost_do_post_locked(

2130 wqset, waitq, args.reserved_preposts);

                 } else if (waitq_set_has_prepost_hook(wqset)) {

2132 waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;

2133

2134 /*

2135 * When calling out to the prepost hook,

2136 * we drop the waitq lock, to allow for the kevent

2137 * subsytem to call into the waitq subsystem again,

2138 * without risking a deadlock.

2139 *

2140 * However, we need to guard against wqset going away,

2141 * so we increment the prepost hook use count

2142 * while the lock is dropped.

2143 *

2144 * This lets waitq_set_deinit() know to wait for the

2145 * prepost hook call to be done before it can proceed.

2146 *

2147 * Note: we need to keep preemption disabled the whole

2148 * time as waitq_set_deinit will spin on this.

2149 */

2150

2151 disable_preemption();

                         os_atomic_add(hook, (uint16_t)1, relaxed);

2153 waitq_set_unlock(wqset);

2154

2155 waitq_set__CALLING_PREPOST_HOOK__(hook);

2156

2157 /* Note: after this decrement, the wqset may be deallocated */

                         os_atomic_add(hook, (uint16_t)-1, relaxed);

2159 enable_preemption();

2160 return ret;

2161 }

2162 }

2163

2164 out_unlock:

2165 waitq_set_unlock(wqset);

2166 return ret;

2167 }

2168

2169 /**

2170 * Routine to iterate over the waitq for non-priority ordered waitqs

2171 *

2172 * Conditions:

2173 * args->waitq (and args->posted_waitq) is locked

2174 *

2175 * Notes:

2176 * Uses the optional select callback function to refine the selection

2177 * of one or more threads from a waitq. The select callback is invoked

2178 * once for every thread that is found to be waiting on the input args->waitq.

2179 *

2180 * If one or more threads are selected, this may disable interrupts.

2181 * The previous interrupt state is returned in args->spl and should

2182 * be used in a call to splx() if threads are returned to the caller.

2183 */

2184 static thread_t

 waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,

2186 spl_t spl, struct waitq_select_args *args,

2187 uint32_t *remaining_eventmask)

2188 {

2189 int max_threads = args->max_threads;

         int *nthreads = args->nthreads;

2191 thread_t thread = THREAD_NULL;

2192 thread_t first_thread = THREAD_NULL;

2193

         qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {

2195 thread_t t = THREAD_NULL;

2196 assert_thread_magic(thread);

2197

2198 /*

2199 * For non-priority ordered waitqs, we allow multiple events to be

2200 * mux'ed into the same waitq. Also safeqs may contain threads from

2201 * multiple waitqs. Only pick threads that match the

2202 * requested wait event.

2203 */

                 if (thread->waitq == waitq && thread->wait_event == args->event) {

2205 t = thread;

2206 if (first_thread == THREAD_NULL) {

2207 first_thread = thread;

2208 }

2209

2210 /* allow the caller to futher refine the selection */

2211 if (args->select_cb) {

                                 t = args->select_cb(args->select_ctx, waitq,

2213 waitq_is_global(waitq), thread);

2214 }

2215 if (t != THREAD_NULL) {

2216 *nthreads += 1;

2217 if (args->threadq) {

2218 /* if output queue, add locked thread to it */

                                         if (*nthreads == 1) {

                                                 *(args->spl) = (safeq != waitq) ? spl : splsched();

2221 }

2222 thread_lock(t);

2223 thread_clear_waitq_state(t);

                                         re_queue_tail(args->threadq, &t->wait_links);

2225 }

2226 /* only enqueue up to 'max' threads */

                                 if (*nthreads >= max_threads && max_threads > 0) {

2228 break;

2229 }

2230 }

2231 }

2232 /* thread wasn't selected so track it's event */

2233 if (t == THREAD_NULL) {

                         *remaining_eventmask |= (thread->waitq != safeq) ?

                             _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event);

2236 }

2237 }

2238

2239 return first_thread;

2240 }

2241

2242 /**

2243 * Routine to iterate and remove threads from priority ordered waitqs

2244 *

2245 * Conditions:

2246 * args->waitq (and args->posted_waitq) is locked

2247 *

2248 * Notes:

2249 * The priority ordered waitqs only support maximum priority element removal.

2250 *

2251 * Also, the implementation makes sure that all threads in a priority ordered

2252 * waitq are waiting on the same wait event. This is not necessarily true for

2253 * non-priority ordered waitqs. If one or more threads are selected, this may

2254 * disable interrupts. The previous interrupt state is returned in args->spl

2255 * and should be used in a call to splx() if threads are returned to the caller.

2256 *

2257 * In the future, we could support priority ordered waitqs with multiple wait

2258 * events in the same queue. The way to implement that would be to keep removing

2259 * elements from the waitq and if the event does not match the requested one,

2260 * add it to a local list. This local list of elements needs to be re-inserted

2261 * into the priority queue at the end and the select_cb return value &

2262 * remaining_eventmask would need to be handled appropriately. The implementation

2263 * is not very efficient but would work functionally.

2264 */

2265 static thread_t

 waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,

2267 spl_t spl, struct waitq_select_args *args,

2268 uint32_t *remaining_eventmask)

2269 {

2270 int max_threads = args->max_threads;

         int *nthreads = args->nthreads;

2272 thread_t first_thread = THREAD_NULL;

2273 thread_t thread = THREAD_NULL;

2274

2275 /*

2276 * The waitq select routines need to handle two cases:

2277 * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0)

2278 * Get the maximum priority thread from the waitq without removing it.

2279 * In that case args->threadq == NULL and max_threads == 1.

2280 * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1)

2281 * Get max_threads (if available) while removing them from the waitq.

2282 * In that case args->threadq != NULL and max_threads is one of {-1, 1}.

2283 *

2284 * The only possible values for remaining_eventmask for the priority queue

2285 * waitq are either 0 (for the remove all threads case) or the original

2286 * safeq->waitq_eventmask (for the lookup/remove one thread cases).

2287 */

2288 *remaining_eventmask = safeq->waitq_eventmask;

2289 boolean_t remove_op = !!(args->threadq);

2290

         while ((max_threads <= 0) || (*nthreads < max_threads)) {

                 if (priority_queue_empty(&(safeq->waitq_prio_queue))) {

2293 *remaining_eventmask = 0;

2294 break;

2295 }

2296

2297 if (remove_op) {

                         thread = priority_queue_remove_max(&safeq->waitq_prio_queue,

2299 struct thread, wait_prioq_links);

2300 } else {

2301 /* For the peek operation, the only valid value for max_threads is 1 */

                         assert(max_threads == 1);

                         thread = priority_queue_max(&safeq->waitq_prio_queue,

2304 struct thread, wait_prioq_links);

2305 }

2306 /*

2307 * Ensure the wait event matches since priority ordered waitqs do not

2308 * support multiple events in the same waitq.

2309 */

                 assert((thread->waitq == waitq) && (thread->wait_event == args->event));

2311

2312 if (args->select_cb) {

2313 /*

2314 * Call the select_cb passed into the waitq_select args. The callback

2315 * updates the select_ctx with information about the highest priority

2316 * thread which is eventually used by the caller.

2317 */

                         thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq,

2319 waitq_is_global(waitq), thread);

2320 if (!remove_op) {

2321 /* For the peek operation, the thread should not be selected for addition */

2322 assert(ret_thread == THREAD_NULL);

2323 } else {

2324 /*

2325 * For the remove operation, the select routine should always return a valid

2326 * thread for priority waitqs. Since all threads in a prioq are equally

2327 * eligible, it should match the thread removed from the prioq. If this

2328 * invariant changes, the implementation would need to handle the

2329 * remaining_eventmask here correctly.

2330 */

2331 assert(ret_thread == thread);

2332 }

2333 }

2334

2335 if (first_thread == THREAD_NULL) {

2336 first_thread = thread;

2337 /*

2338 * turnstile_kernel_update_inheritor_on_wake_locked will lock

2339 * first_thread, so call it before locking it.

2340 */

                         if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {

                                 turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);

2343 }

2344 }

2345

2346 /* For the peek operation, break out early */

2347 if (!remove_op) {

2348 break;

2349 }

2350

2351 /* Add the thread to the result thread list */

2352 *nthreads += 1;

                 if (*nthreads == 1) {

                         *(args->spl) = (safeq != waitq) ? spl : splsched();

2355 }

2356 thread_lock(thread);

2357 thread_clear_waitq_state(thread);

                 enqueue_tail(args->threadq, &(thread->wait_links));

2359 }

2360

2361 return first_thread;

2362 }

2363

2364 /**

2365 * generic thread selection from a waitq (and sets to which the waitq belongs)

2366 *

2367 * Conditions:

2368 * args->waitq (and args->posted_waitq) is locked

2369 *

2370 * Notes:

2371 * Uses the optional select callback function to refine the selection

2372 * of one or more threads from a waitq and any set to which the waitq

2373 * belongs. The select callback is invoked once for every thread that

2374 * is found to be waiting on the input args->waitq.

2375 *

2376 * If one or more threads are selected, this may disable interrupts.

2377 * The previous interrupt state is returned in args->spl and should

2378 * be used in a call to splx() if threads are returned to the caller.

2379 */

2380 static void

 do_waitq_select_n_locked(struct waitq_select_args *args)

2382 {

         struct waitq *waitq = args->waitq;

2384 int max_threads = args->max_threads;

2385 thread_t first_thread = THREAD_NULL;

2386 struct waitq *safeq;

2387 uint32_t remaining_eventmask = 0;

2388 uint32_t eventmask;

         int *nthreads = args->nthreads;

2390 spl_t spl = 0;

2391

         assert(max_threads != 0);

2393

         if (!waitq_irq_safe(waitq)) {

2395 /* JMM - add flag to waitq to avoid global lookup if no waiters */

2396 eventmask = _CAST_TO_EVENT_MASK(waitq);

2397 safeq = waitq_get_safeq(waitq);

2398 if (safeq == NULL) {

2399 /*

2400 * in the WQT_TSPROXY case, if there's no turnstile,

2401 * there's no queue and no waiters, so we can move straight

2402 * to the waitq set recursion

2403 */

2404 goto handle_waitq_set;

2405 }

2406

                 if (*nthreads == 0) {

2408 spl = splsched();

2409 }

2410 waitq_lock(safeq);

2411 } else {

                 eventmask = _CAST_TO_EVENT_MASK(args->event);

2413 safeq = waitq;

2414 }

2415

2416 /*

2417 * If the safeq doesn't have an eventmask (not global) or the event

2418 * we're looking for IS set in its eventmask, then scan the threads

2419 * in that queue for ones that match the original <waitq,event> pair.

2420 */

         if (!waitq_is_global(safeq) ||

             (safeq->waitq_eventmask & eventmask) == eventmask) {

                 if (waitq_is_turnstile_queue(safeq)) {

                         first_thread = waitq_prioq_iterate_locked(safeq, waitq,

2425 spl, args,

2426 &remaining_eventmask);

2427 } else {

                         first_thread = waitq_queue_iterate_locked(safeq, waitq,

2429 spl, args,

2430 &remaining_eventmask);

2431 }

2432

2433 /*

2434 * Update the eventmask of global queues we just scanned:

2435 * - If we selected all the threads in the queue, we can clear its

2436 * eventmask.

2437 *

2438 * - If we didn't find enough threads to fill our needs, then we can

2439 * assume we looked at every thread in the queue and the mask we

2440 * computed is complete - so reset it.

2441 */

                 if (waitq_is_global(safeq)) {

                         if (waitq_empty(safeq)) {

2444 safeq->waitq_eventmask = 0;

                         } else if (max_threads < 0 || *nthreads < max_threads) {

2446 safeq->waitq_eventmask = remaining_eventmask;

2447 }

2448 }

2449 }

2450

2451 /*

2452 * Grab the first thread in the queue if no other thread was selected.

2453 * We can guarantee that no one has manipulated this thread because

2454 * it's waiting on the given waitq, and we have that waitq locked.

2455 */

         if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {

2457 /* we know this is the first (and only) thread */

2458 ++(*nthreads);

                 *(args->spl) = (safeq != waitq) ? spl : splsched();

2460

2461 thread_lock(first_thread);

2462 thread_clear_waitq_state(first_thread);

2463 waitq_thread_remove(safeq, first_thread);

                 enqueue_tail(args->threadq, &(first_thread->wait_links));

2465

2466 /* update the eventmask on [now] empty global queues */

                 if (waitq_is_global(safeq) && waitq_empty(safeq)) {

2468 safeq->waitq_eventmask = 0;

2469 }

2470 }

2471

2472 /* unlock the safe queue if we locked one above */

2473 if (safeq != waitq) {

2474 waitq_unlock(safeq);

                 if (*nthreads == 0) {

2476 splx(spl);

2477 }

2478 }

2479

         if (max_threads > 0 && *nthreads >= max_threads) {

2481 return;

2482 }

2483

2484 handle_waitq_set:

2485 /*

2486 * wait queues that are not in any sets

2487 * are the bottom of the recursion

2488 */

2489 if (!waitq->waitq_set_id) {

2490 return;

2491 }

2492

2493 /* check to see if the set ID for this wait queue is valid */

         struct waitq_link *link = wql_get_link(waitq->waitq_set_id);

2495 if (!link) {

2496 /* the waitq set to which this waitq belonged, has been invalidated */

2497 waitq->waitq_set_id = 0;

2498 return;

2499 }

2500

2501 wql_put_link(link);

2502

2503 /*

2504 * If this waitq is a member of any wait queue sets, we need to look

2505 * for waiting thread(s) in any of those sets, and prepost all sets that

2506 * don't have active waiters.

2507 *

2508 * Note that we do a local walk of this waitq's links - we manually

2509 * recurse down wait queue set's with non-zero wqset_q.waitq_set_id

2510 */

         (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,

             WQL_WQS, (void *)args, waitq_select_walk_cb);

2513 }

2514

2515 /**

2516 * main entry point for thread selection from a waitq

2517 *

2518 * Conditions:

2519 * waitq is locked

2520 *

2521 * Returns:

2522 * The number of threads waiting on 'waitq' for 'event' which have

2523 * been placed onto the input 'threadq'

2524 *

2525 * Notes:

2526 * The 'select_cb' function is invoked for every thread found waiting on

2527 * 'waitq' for 'event'. The thread is _not_ locked upon callback

2528 * invocation. This parameter may be NULL.

2529 *

2530 * If one or more threads are returned in 'threadq' then the caller is

2531 * responsible to call splx() using the returned 'spl' value. Each

2532 * returned thread is locked.

2533 */

2534 static __inline__ int

 waitq_select_n_locked(struct waitq *waitq,

2536 event64_t event,

2537 waitq_select_cb select_cb,

2538 void *select_ctx,

2539 uint64_t *reserved_preposts,

2540 queue_t threadq,

2541 int max_threads, spl_t *spl,

2542 int priority)

2543 {

2544 int nthreads = 0;

2545

2546 struct waitq_select_args args = {

2547 .posted_waitq = waitq,

2548 .waitq = waitq,

2549 .event = event,

2550 .select_cb = select_cb,

2551 .select_ctx = select_ctx,

2552 .priority = priority,

2553 .reserved_preposts = reserved_preposts,

2554 .threadq = threadq,

2555 .max_threads = max_threads,

2556 .nthreads = &nthreads,

2557 .spl = spl,

2558 };

2559

2560 do_waitq_select_n_locked(&args);

2561 return nthreads;

2562 }

2563

2564 /**

2565 * select from a waitq a single thread waiting for a given event

2566 *

2567 * Conditions:

2568 * 'waitq' is locked

2569 *

2570 * Returns:

2571 * A locked thread that's been removed from the waitq, but has not

2572 * yet been put on a run queue. Caller is responsible to call splx

2573 * with the '*spl' value.

2574 */

2575 static thread_t

 waitq_select_one_locked(struct waitq *waitq, event64_t event,

2577 uint64_t *reserved_preposts,

2578 int priority, spl_t *spl)

2579 {

2580 int nthreads;

2581 queue_head_t threadq;

2582

2583 queue_init(&threadq);

2584

         nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,

             reserved_preposts, &threadq, 1, spl, priority);

2587

2588 /* if we selected a thread, return it (still locked) */

         if (!queue_empty(&threadq)) {

2590 thread_t t;

2591 queue_entry_t qe = dequeue_head(&threadq);

                 t = qe_element(qe, struct thread, wait_links);

                 assert(queue_empty(&threadq)); /* there should be 1 entry */

2594 /* t has been locked and removed from all queues */

2595 return t;

2596 }

2597

2598 return THREAD_NULL;

2599 }

2600

2601 struct select_thread_ctx {

2602 thread_t thread;

2603 event64_t event;

2604 spl_t *spl;

2605 };

2606

2607 /**

2608 * link walk callback invoked once for each set to which a waitq belongs

2609 *

2610 * Conditions:

2611 * initial waitq is locked

2612 * ctx->thread is unlocked

2613 *

2614 * Notes:

2615 * This may disable interrupts and early-out of the full DAG link walk by

2616 * returning KERN_ALREADY_IN_SET. In this case, the returned thread has

2617 * been removed from the waitq, it's waitq state has been reset, and the

2618 * caller is responsible to call splx() with the returned interrupt state

2619 * in ctx->spl.

2620 */

2621 static int

 waitq_select_thread_cb(struct waitq *waitq, void *ctx,

2623 struct waitq_link *link)

2624 {

         struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;

2626 struct waitq_set *wqset;

2627 struct waitq *wqsetq;

2628 struct waitq *safeq;

2629 spl_t s;

2630

2631 (void)waitq;

2632

2633 thread_t thread = stctx->thread;

2634 event64_t event = stctx->event;

2635

         if (wql_type(link) != WQL_WQS) {

2637 return WQ_ITERATE_CONTINUE;

2638 }

2639

2640 wqset = link->wql_wqs.wql_set;

2641 wqsetq = &wqset->wqset_q;

2642

         assert(!waitq_irq_safe(waitq));

         assert(!waitq_irq_safe(wqsetq));

2645

2646 waitq_set_lock(wqset);

2647

2648 s = splsched();

2649

2650 /* find and lock the interrupt-safe waitq the thread is thought to be on */

2651 safeq = waitq_get_safeq(wqsetq);

2652 waitq_lock(safeq);

2653

2654 thread_lock(thread);

2655

         if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {

2657 waitq_thread_remove(wqsetq, thread);

                 if (waitq_empty(safeq)) {

2659 safeq->waitq_eventmask = 0;

2660 }

2661 thread_clear_waitq_state(thread);

2662 waitq_unlock(safeq);

2663 waitq_set_unlock(wqset);

2664 /*

2665 * thread still locked,

2666 * return non-zero to break out of WQS walk

2667 */

2668 *(stctx->spl) = s;

2669 return WQ_ITERATE_FOUND;

2670 }

2671

2672 thread_unlock(thread);

2673 waitq_set_unlock(wqset);

2674 waitq_unlock(safeq);

2675 splx(s);

2676

2677 return WQ_ITERATE_CONTINUE;

2678 }

2679

2680 /**

2681 * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting

2682 * on 'waitq' (or any set to which waitq belongs) for 'event'

2683 *

2684 * Conditions:

2685 * 'waitq' is locked

2686 * 'thread' is unlocked

2687 */

2688 static kern_return_t

 waitq_select_thread_locked(struct waitq *waitq,

2690 event64_t event,

2691 thread_t thread, spl_t *spl)

2692 {

2693 struct waitq *safeq;

2694 struct waitq_link *link;

2695 struct select_thread_ctx ctx;

2696 kern_return_t kr;

2697 spl_t s;

2698

2699 /* Find and lock the interrupts disabled queue the thread is actually on */

         if (!waitq_irq_safe(waitq)) {

2701 safeq = waitq_get_safeq(waitq);

2702 if (safeq == NULL) {

2703 /*

2704 * in the WQT_TSPROXY case, if there's no turnstile,

2705 * there's no queue and no waiters, so we can move straight

2706 * to the waitq set recursion

2707 */

2708 goto handle_waitq_set;

2709 }

2710

2711 s = splsched();

2712 waitq_lock(safeq);

2713 } else {

2714 s = splsched();

2715 safeq = waitq;

2716 }

2717

2718 thread_lock(thread);

2719

         if ((thread->waitq == waitq) && (thread->wait_event == event)) {

2721 waitq_thread_remove(safeq, thread);

                 if (waitq_empty(safeq)) {

2723 safeq->waitq_eventmask = 0;

2724 }

2725 thread_clear_waitq_state(thread);

2726 *spl = s;

2727 /* thread still locked */

2728 return KERN_SUCCESS;

2729 }

2730

2731 thread_unlock(thread);

2732

2733 if (safeq != waitq) {

2734 waitq_unlock(safeq);

2735 }

2736

2737 splx(s);

2738

2739 handle_waitq_set:

2740 if (!waitq->waitq_set_id) {

2741 return KERN_NOT_WAITING;

2742 }

2743

2744 /* check to see if the set ID for this wait queue is valid */

         link = wql_get_link(waitq->waitq_set_id);

2746 if (!link) {

2747 /* the waitq to which this set belonged, has been invalidated */

2748 waitq->waitq_set_id = 0;

2749 return KERN_NOT_WAITING;

2750 }

2751

2752 /*

2753 * The thread may be waiting on a wait queue set to which

2754 * the input 'waitq' belongs. Go look for the thread in

2755 * all wait queue sets. If it's there, we'll remove it

2756 * because it's equivalent to waiting directly on the input waitq.

2757 */

2758 ctx.thread = thread;

2759 ctx.event = event;

2760 ctx.spl = spl;

         kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,

             WQL_WQS, (void *)&ctx, waitq_select_thread_cb);

2763

2764 wql_put_link(link);

2765

2766 /* we found a thread, return success */

2767 if (kr == WQ_ITERATE_FOUND) {

2768 return KERN_SUCCESS;

2769 }

2770

2771 return KERN_NOT_WAITING;

2772 }

2773

2774 static int

 prepost_exists_cb(struct waitq_set __unused *wqset,

2776 void __unused *ctx,

2777 struct wq_prepost __unused *wqp,

2778 struct waitq __unused *waitq)

2779 {

2780 /* if we get here, then we know that there is a valid prepost object! */

2781 return WQ_ITERATE_FOUND;

2782 }

2783

2784 /**

2785 * declare a thread's intent to wait on 'waitq' for 'wait_event'

2786 *

2787 * Conditions:

2788 * 'waitq' is locked

2789 */

2790 wait_result_t

 waitq_assert_wait64_locked(struct waitq *waitq,

2792 event64_t wait_event,

2793 wait_interrupt_t interruptible,

2794 wait_timeout_urgency_t urgency,

2795 uint64_t deadline,

2796 uint64_t leeway,

2797 thread_t thread)

2798 {

2799 wait_result_t wait_result;

2800 int realtime = 0;

2801 struct waitq *safeq;

2802 uintptr_t eventmask;

2803 spl_t s;

2804

2805

2806 /*

2807 * Warning: Do _not_ place debugging print statements here.

2808 * The waitq is locked!

2809 */

         assert(!thread->started || thread == current_thread());

2811

         if (thread->waitq != NULL) {

                 panic("thread already waiting on %p", thread->waitq);

2814 }

2815

         if (waitq_is_set(waitq)) {

                 struct waitq_set *wqset = (struct waitq_set *)waitq;

2818 /*

2819 * early-out if the thread is waiting on a wait queue set

2820 * that has already been pre-posted.

2821 */

                 if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {

2823 int ret;

2824 /*

2825 * Run through the list of potential preposts. Because

2826 * this is a hot path, we short-circuit the iteration

2827 * if we find just one prepost object.

2828 */

                         ret = wq_prepost_foreach_locked(wqset, NULL,

2830 prepost_exists_cb);

2831 if (ret == WQ_ITERATE_FOUND) {

2832 s = splsched();

2833 thread_lock(thread);

2834 thread->wait_result = THREAD_AWAKENED;

2835 thread_unlock(thread);

2836 splx(s);

2837 return THREAD_AWAKENED;

2838 }

2839 }

2840 }

2841

2842 s = splsched();

2843

2844 /*

2845 * If already dealing with an irq safe wait queue, we are all set.

2846 * Otherwise, determine a global queue to use and lock it.

2847 */

         if (!waitq_irq_safe(waitq)) {

2849 safeq = waitq_get_safeq(waitq);

                 if (__improbable(safeq == NULL)) {

2851 panic("Trying to assert_wait on a turnstile proxy "

                             "that hasn't been donated one (waitq: %p)", waitq);

2853 }

2854 eventmask = _CAST_TO_EVENT_MASK(waitq);

2855 waitq_lock(safeq);

2856 } else {

2857 safeq = waitq;

2858 eventmask = _CAST_TO_EVENT_MASK(wait_event);

2859 }

2860

2861 /* lock the thread now that we have the irq-safe waitq locked */

2862 thread_lock(thread);

2863

2864 /*

2865 * Realtime threads get priority for wait queue placements.

2866 * This allows wait_queue_wakeup_one to prefer a waiting

2867 * realtime thread, similar in principle to performing

2868 * a wait_queue_wakeup_all and allowing scheduler prioritization

2869 * to run the realtime thread, but without causing the

2870 * lock contention of that scenario.

2871 */

         if (thread->sched_pri >= BASEPRI_REALTIME) {

2873 realtime = 1;

2874 }

2875

2876 /*

2877 * This is the extent to which we currently take scheduling attributes

2878 * into account. If the thread is vm priviledged, we stick it at

2879 * the front of the queue. Later, these queues will honor the policy

2880 * value set at waitq_init time.

2881 */

         wait_result = thread_mark_wait_locked(thread, interruptible);

2883 /* thread->wait_result has been set */

2884 if (wait_result == THREAD_WAITING) {

2885 if (!safeq->waitq_fifo

                     || (thread->options & TH_OPT_VMPRIV) || realtime) {

                         waitq_thread_insert(safeq, thread, false);

2888 } else {

                         waitq_thread_insert(safeq, thread, true);

2890 }

2891

2892 /* mark the event and real waitq, even if enqueued on a global safeq */

2893 thread->wait_event = wait_event;

2894 thread->waitq = waitq;

2895

                 if (deadline != 0) {

2897 boolean_t act;

2898

                         act = timer_call_enter_with_leeway(&thread->wait_timer,

2900 NULL,

2901 deadline, leeway,

2902 urgency, FALSE);

2903 if (!act) {

2904 thread->wait_timer_active++;

2905 }

2906 thread->wait_timer_is_set = TRUE;

2907 }

2908

                 if (waitq_is_global(safeq)) {

2910 safeq->waitq_eventmask |= eventmask;

2911 }

2912

2913 waitq_stats_count_wait(waitq);

2914 }

2915

2916 /* unlock the thread */

2917 thread_unlock(thread);

2918

2919 /* update the inheritor's thread priority if the waitq is embedded in turnstile */

         if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) {

                 turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));

                 turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));

2923 }

2924

2925 /* unlock the safeq if we locked it here */

2926 if (safeq != waitq) {

2927 waitq_unlock(safeq);

2928 }

2929

2930 splx(s);

2931

2932 return wait_result;

2933 }

2934

2935 /**

2936 * remove 'thread' from its current blocking state on 'waitq'

2937 *

2938 * Conditions:

2939 * 'thread' is locked

2940 *

2941 * Notes:

2942 * This function is primarily used by clear_wait_internal in

2943 * sched_prim.c from the thread timer wakeup path

2944 * (i.e. the thread was waiting on 'waitq' with a timeout that expired)

2945 */

2946 int

 waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)

2948 {

2949 struct waitq *safeq;

2950

2951 assert_thread_magic(thread);

         assert(thread->waitq == waitq);

2953

2954 /* Find the interrupts disabled queue thread is waiting on */

         if (!waitq_irq_safe(waitq)) {

2956 safeq = waitq_get_safeq(waitq);

                 if (__improbable(safeq == NULL)) {

2958 panic("Trying to clear_wait on a turnstile proxy "

                             "that hasn't been donated one (waitq: %p)", waitq);

2960 }

2961 } else {

2962 safeq = waitq;

2963 }

2964

2965 /* thread is already locked so have to try for the waitq lock */

         if (!waitq_lock_try(safeq)) {

2967 return 0;

2968 }

2969

2970 waitq_thread_remove(safeq, thread);

2971 thread_clear_waitq_state(thread);

2972 waitq_stats_count_clear_wakeup(waitq);

2973

2974 /* clear the global event mask if this was the last thread there! */

         if (waitq_is_global(safeq) && waitq_empty(safeq)) {

2976 safeq->waitq_eventmask = 0;

2977 /* JMM - also mark no-waiters on waitq (if not the same as the safeq) */

2978 }

2979

2980 waitq_unlock(safeq);

2981

2982 return 1;

2983 }

2984

2985

2986 static __inline__

2987 void

2988 maybe_adjust_thread_pri(thread_t thread,

2989 int priority,

2990 __kdebug_only struct waitq *waitq)

2991 {

2992 /*

2993 * If the caller is requesting the waitq subsystem to promote the

2994 * priority of the awoken thread, then boost the thread's priority to

2995 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or

2996 * higher priority). This boost must be removed via a call to

2997 * waitq_clear_promotion_locked before the thread waits again.

2998 *

2999 * WAITQ_PROMOTE_PRIORITY is -2.

3000 * Anything above 0 represents a mutex promotion.

3001 * The default 'no action' value is -1.

3002 * TODO: define this in a header

3003 */

3004 if (priority == WAITQ_PROMOTE_PRIORITY) {

3005 uintptr_t trace_waitq = 0;

                 if (__improbable(kdebug_enable)) {

3007 trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq);

3008 }

3009

                 sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);

3011 }

3012 }

3013

3014 /*

3015 * Clear a potential thread priority promotion from a waitq wakeup

3016 * with WAITQ_PROMOTE_PRIORITY.

3017 *

3018 * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED.

3019 */

3020 void

 waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)

3022 {

3023 spl_t s;

3024

         assert(waitq_held(waitq));

3026 assert(thread != THREAD_NULL);

         assert(thread == current_thread());

3028

3029 /* This flag is only cleared by the thread itself, so safe to check outside lock */

         if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {

3031 return;

3032 }

3033

         if (!waitq_irq_safe(waitq)) {

3035 s = splsched();

3036 }

3037 thread_lock(thread);

3038

         sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);

3040

3041 thread_unlock(thread);

         if (!waitq_irq_safe(waitq)) {

3043 splx(s);

3044 }

3045 }

3046

3047 /**

3048 * wakeup all threads waiting on 'waitq' for 'wake_event'

3049 *

3050 * Conditions:

3051 * 'waitq' is locked

3052 *

3053 * Notes:

3054 * May temporarily disable and re-enable interrupts

3055 * and re-adjust thread priority of each awoken thread.

3056 *

3057 * If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have

3058 * been unlocked before calling thread_go() on any returned threads, and

3059 * is guaranteed to be unlocked upon function return.

3060 */

3061 kern_return_t

 waitq_wakeup64_all_locked(struct waitq *waitq,

3063 event64_t wake_event,

3064 wait_result_t result,

3065 uint64_t *reserved_preposts,

3066 int priority,

3067 waitq_lock_state_t lock_state)

3068 {

3069 kern_return_t ret;

3070 thread_t thread;

3071 spl_t th_spl;

3072 int nthreads;

3073 queue_head_t wakeup_queue;

3074

         assert(waitq_held(waitq));

3076 queue_init(&wakeup_queue);

3077

         nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,

3079 reserved_preposts,

             &wakeup_queue, -1, &th_spl, priority);

3081

3082 /* set each thread running */

3083 ret = KERN_NOT_WAITING;

3084

3085 #if CONFIG_WAITQ_STATS

         qe_foreach_element(thread, &wakeup_queue, wait_links)

3087 waitq_stats_count_wakeup(waitq);

3088 #endif

3089 if (lock_state == WAITQ_UNLOCK) {

3090 waitq_unlock(waitq);

3091 }

3092

         qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {

3094 assert_thread_magic(thread);

3095 remqueue(&thread->wait_links);

                 maybe_adjust_thread_pri(thread, priority, waitq);

                 ret = thread_go(thread, result, WQ_OPTION_NONE);

3098 assert(ret == KERN_SUCCESS);

3099 thread_unlock(thread);

3100 }

         if (nthreads > 0) {

3102 splx(th_spl);

3103 } else {

3104 waitq_stats_count_fail(waitq);

3105 }

3106

3107 return ret;

3108 }

3109

3110 /**

3111 * wakeup one thread waiting on 'waitq' for 'wake_event'

3112 *

3113 * Conditions:

3114 * 'waitq' is locked

3115 *

3116 * Notes:

3117 * May temporarily disable and re-enable interrupts.

3118 */

3119 kern_return_t

 waitq_wakeup64_one_locked(struct waitq *waitq,

3121 event64_t wake_event,

3122 wait_result_t result,

3123 uint64_t *reserved_preposts,

3124 int priority,

3125 waitq_lock_state_t lock_state,

3126 waitq_options_t option)

3127 {

3128 thread_t thread;

3129 spl_t th_spl;

3130

         assert(waitq_held(waitq));

3132

         thread = waitq_select_one_locked(waitq, wake_event,

3134 reserved_preposts,

3135 priority, &th_spl);

3136

3137 if (thread != THREAD_NULL) {

3138 waitq_stats_count_wakeup(waitq);

3139 } else {

3140 waitq_stats_count_fail(waitq);

3141 }

3142

3143 if (lock_state == WAITQ_UNLOCK) {

3144 waitq_unlock(waitq);

3145 }

3146

3147 if (thread != THREAD_NULL) {

                 maybe_adjust_thread_pri(thread, priority, waitq);

                 kern_return_t ret = thread_go(thread, result, option);

3150 assert(ret == KERN_SUCCESS);

3151 thread_unlock(thread);

3152 splx(th_spl);

3153 return ret;

3154 }

3155

3156 return KERN_NOT_WAITING;

3157 }

3158

3159 /**

3160 * wakeup one thread waiting on 'waitq' for 'wake_event'

3161 *

3162 * Conditions:

3163 * 'waitq' is locked

3164 *

3165 * Returns:

3166 * A locked, runnable thread.

3167 * If return value is non-NULL, interrupts have also

3168 * been disabled, and the caller is responsible to call

3169 * splx() with the returned '*spl' value.

3170 */

3171 thread_t

 waitq_wakeup64_identify_locked(struct waitq     *waitq,

3173 event64_t wake_event,

3174 wait_result_t result,

3175 spl_t *spl,

3176 uint64_t *reserved_preposts,

3177 int priority,

3178 waitq_lock_state_t lock_state)

3179 {

3180 thread_t thread;

3181

         assert(waitq_held(waitq));

3183

         thread = waitq_select_one_locked(waitq, wake_event,

3185 reserved_preposts,

3186 priority, spl);

3187

3188 if (thread != THREAD_NULL) {

3189 waitq_stats_count_wakeup(waitq);

3190 } else {

3191 waitq_stats_count_fail(waitq);

3192 }

3193

3194 if (lock_state == WAITQ_UNLOCK) {

3195 waitq_unlock(waitq);

3196 }

3197

3198 if (thread != THREAD_NULL) {

3199 kern_return_t __assert_only ret;

                 ret = thread_go(thread, result, WQ_OPTION_NONE);

3201 assert(ret == KERN_SUCCESS);

3202 }

3203

3204 return thread; /* locked if not NULL (caller responsible for spl) */

3205 }

3206

3207 /**

3208 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'

3209 *

3210 * Conditions:

3211 * 'waitq' is locked

3212 * 'thread' is unlocked

3213 *

3214 * Notes:

3215 * May temporarily disable and re-enable interrupts

3216 *

3217 * If the input lock_state == WAITQ_UNLOCK then the waitq will have been

3218 * unlocked before calling thread_go() if 'thread' is to be awoken, and

3219 * is guaranteed to be unlocked upon function return.

3220 */

3221 kern_return_t

 waitq_wakeup64_thread_locked(struct waitq *waitq,

3223 event64_t wake_event,

3224 thread_t thread,

3225 wait_result_t result,

3226 waitq_lock_state_t lock_state)

3227 {

3228 kern_return_t ret;

3229 spl_t th_spl;

3230

         assert(waitq_held(waitq));

3232 assert_thread_magic(thread);

3233

3234 /*

3235 * See if the thread was still waiting there. If so, it got

3236 * dequeued and returned locked.

3237 */

         ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);

3239

3240 if (ret == KERN_SUCCESS) {

3241 waitq_stats_count_wakeup(waitq);

3242 } else {

3243 waitq_stats_count_fail(waitq);

3244 }

3245

3246 if (lock_state == WAITQ_UNLOCK) {

3247 waitq_unlock(waitq);

3248 }

3249

3250 if (ret != KERN_SUCCESS) {

3251 return KERN_NOT_WAITING;

3252 }

3253

         ret = thread_go(thread, result, WQ_OPTION_NONE);

3255 assert(ret == KERN_SUCCESS);

3256 thread_unlock(thread);

3257 splx(th_spl);

3258

3259 return ret;

3260 }

3261

3262

3263

3264 /* ----------------------------------------------------------------------

3265 *

3266 * In-Kernel API

3267 *

3268 * ---------------------------------------------------------------------- */

3269

3270 /**

3271 * initialize a waitq object

3272 */

3273 kern_return_t

 waitq_init(struct waitq *waitq, int policy)

3275 {

3276 assert(waitq != NULL);

3277

3278 /* only FIFO and LIFO for now */

         if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0) {

3280 return KERN_INVALID_ARGUMENT;

3281 }

3282

         waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);

3284 waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);

3285 waitq->waitq_prepost = 0;

3286 if (policy & SYNC_POLICY_TURNSTILE_PROXY) {

3287 waitq->waitq_type = WQT_TSPROXY;

3288 } else {

3289 waitq->waitq_type = WQT_QUEUE;

3290 }

3291 waitq->waitq_turnstile = !!(policy & SYNC_POLICY_TURNSTILE);

3292 waitq->waitq_eventmask = 0;

3293

3294 waitq->waitq_set_id = 0;

3295 waitq->waitq_prepost_id = 0;

3296

3297 waitq_lock_init(waitq);

         if (waitq_is_turnstile_queue(waitq)) {

3299 /* For turnstile, initialize it as a priority queue */

3300 priority_queue_init(&waitq->waitq_prio_queue);

                 assert(waitq->waitq_fifo == 0);

         } else if (policy & SYNC_POLICY_TURNSTILE_PROXY) {

3303 waitq->waitq_ts = TURNSTILE_NULL;

3304 waitq->waitq_tspriv = NULL;

3305 } else {

3306 queue_init(&waitq->waitq_queue);

3307 }

3308

3309 waitq->waitq_isvalid = 1;

3310 return KERN_SUCCESS;

3311 }

3312

3313 struct wq_unlink_ctx {

3314 struct waitq *unlink_wq;

3315 struct waitq_set *unlink_wqset;

3316 };

3317

 static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,

     struct wq_prepost *wqp, struct waitq *waitq);

3320

3321 /**

3322 * walk_waitq_links callback to invalidate 'link' parameter

3323 *

3324 * Conditions:

3325 * Called from walk_waitq_links.

3326 * Note that unlink other callbacks, this one make no assumptions about

3327 * the 'waitq' parameter, specifically it does not have to be locked or

3328 * even valid.

3329 */

3330 static int

 waitq_unlink_all_cb(struct waitq *waitq, void *ctx,

3332 struct waitq_link *link)

3333 {

3334 (void)waitq;

3335 (void)ctx;

         if (wql_type(link) == WQL_LINK && wql_is_valid(link)) {

3337 wql_invalidate(link);

3338 }

3339

         if (wql_type(link) == WQL_WQS) {

3341 struct waitq_set *wqset;

3342 struct wq_unlink_ctx ulctx;

3343

3344 /*

3345 * When destroying the waitq, take the time to clear out any

3346 * preposts it may have made. This could potentially save time

3347 * on the IPC send path which would otherwise have to iterate

3348 * over lots of dead port preposts.

3349 */

                 if (waitq->waitq_prepost_id == 0) {

3351 goto out;

3352 }

3353

3354 wqset = link->wql_wqs.wql_set;

3355 assert(wqset != NULL);

                 assert(!waitq_irq_safe(&wqset->wqset_q));

3357

3358 waitq_set_lock(wqset);

3359

                 if (!waitq_set_is_valid(wqset)) {

3361 /* someone raced us to teardown */

3362 goto out_unlock;

3363 }

                 if (!waitq_set_maybe_preposted(wqset)) {

3365 goto out_unlock;

3366 }

3367

3368 ulctx.unlink_wq = waitq;

3369 ulctx.unlink_wqset = wqset;

                 (void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,

3371 waitq_unlink_prepost_cb);

3372 out_unlock:

3373 waitq_set_unlock(wqset);

3374 }

3375

3376 out:

3377 return WQ_ITERATE_CONTINUE;

3378 }

3379

3380

3381 /**

3382 * cleanup any link/prepost table resources associated with a waitq

3383 */

3384 void

 waitq_deinit(struct waitq *waitq)

3386 {

3387 spl_t s;

3388

3389 assert(waitq);

         if (!waitq_is_valid(waitq)) {

3391 return;

3392 }

3393

         if (!waitq_is_queue(waitq) && !waitq_is_turnstile_proxy(waitq)) {

3395 return;

3396 }

3397

         if (waitq_irq_safe(waitq)) {

3399 s = splsched();

3400 }

3401 waitq_lock(waitq);

3402

         if (waitq_valid(waitq)) {

3404 waitq->waitq_isvalid = 0;

                 if (!waitq_irq_safe(waitq)) {

3406 waitq_unlink_all_unlock(waitq);

3407 /* waitq unlocked and set links deallocated */

3408 goto out;

3409 }

3410 }

3411

3412 waitq_unlock(waitq);

         if (waitq_irq_safe(waitq)) {

3414 splx(s);

3415 }

3416

3417 out:

3418 #if MACH_ASSERT

         if (waitq_is_turnstile_queue(waitq)) {

                 assert(priority_queue_empty(&waitq->waitq_prio_queue));

         } else if (waitq_is_turnstile_proxy(waitq)) {

                 assert(waitq->waitq_ts == TURNSTILE_NULL);

3423 } else {

                 assert(queue_empty(&waitq->waitq_queue));

3425 }

3426 #else

         (void)0;

3428 #endif // MACH_ASSERT

3429 }

3430

3431 void

 waitq_invalidate_locked(struct waitq *waitq)

3433 {

         assert(waitq_held(waitq));

         assert(waitq_is_valid(waitq));

3436 waitq->waitq_isvalid = 0;

3437 }

3438

3439 /**

3440 * invalidate the given wq_prepost object

3441 *

3442 * Conditions:

3443 * Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)

3444 */

3445 static int

 wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,

3447 void __unused *ctx,

3448 struct wq_prepost *wqp,

3449 struct waitq __unused *waitq)

3450 {

         if (wqp_type(wqp) == WQP_POST) {

3452 wq_prepost_invalidate(wqp);

3453 }

3454 return WQ_ITERATE_CONTINUE;

3455 }

3456

3457

3458 /**

3459 * allocate and initialize a waitq set object

3460 *

3461 * Conditions:

3462 * may block

3463 *

3464 * Returns:

3465 * allocated / initialized waitq_set object.

3466 * the waits_set object returned does not have

3467 * a waitq_link associated.

3468 *

3469 * NULL on failure

3470 */

3471 struct waitq_set *

 waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)

3473 {

3474 struct waitq_set *wqset;

3475

         wqset = (struct waitq_set *)zalloc(waitq_set_zone);

3477 if (!wqset) {

                 panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);

3479 }

3480

3481 kern_return_t ret;

         ret = waitq_set_init(wqset, policy, NULL, prepost_hook);

3483 if (ret != KERN_SUCCESS) {

3484 zfree(waitq_set_zone, wqset);

3485 wqset = NULL;

3486 }

3487

3488 return wqset;

3489 }

3490

3491 /**

3492 * initialize a waitq set object

3493 *

3494 * if no 'reserved_link' object is passed

3495 * the waitq_link will be lazily allocated

3496 * on demand through waitq_set_lazy_init_link.

3497 */

3498 kern_return_t

 waitq_set_init(struct waitq_set *wqset,

     int policy, uint64_t *reserved_link,

3501 waitq_set_prepost_hook_t *prepost_hook)

3502 {

3503 struct waitq_link *link;

3504 kern_return_t ret;

3505

         memset(wqset, 0, sizeof(*wqset));

3507

         ret = waitq_init(&wqset->wqset_q, policy);

3509 if (ret != KERN_SUCCESS) {

3510 return ret;

3511 }

3512

3513 wqset->wqset_q.waitq_type = WQT_SET;

3514 if (policy & SYNC_POLICY_PREPOST) {

                 wqset->wqset_q.waitq_prepost = 1;

3516 wqset->wqset_prepost_id = 0;

3517 assert(prepost_hook == NULL);

3518 } else {

                 wqset->wqset_q.waitq_prepost = 0;

3520 wqset->wqset_prepost_hook = prepost_hook;

3521 }

3522

         if (reserved_link && *reserved_link != 0) {

                 link = wql_get_reserved(*reserved_link, WQL_WQS);

3525

3526 if (!link) {

                         panic("Can't allocate link object for waitq set: %p", wqset);

3528 }

3529

3530 /* always consume the caller's reference */

3531 *reserved_link = 0;

3532

3533 link->wql_wqs.wql_set = wqset;

3534 wql_mkvalid(link);

3535

                 wqset->wqset_id = link->wql_setid.id;

3537 wql_put_link(link);

3538 } else {

3539 /*

3540 * Lazy allocate the link only when an actual id is needed.

3541 */

3542 wqset->wqset_id = WQSET_NOT_LINKED;

3543 }

3544

3545 return KERN_SUCCESS;

3546 }

3547

3548 #if DEVELOPMENT || DEBUG

3549

3550 int

3551 sysctl_helper_waitq_set_nelem(void)

3552 {

3553 return ltable_nelem(&g_wqlinktable);

3554 }

3555

3556 #endif

3557

3558 /**

3559 * initialize a waitq set link.

3560 *

3561 * Conditions:

3562 * may block

3563 * locks and unlocks the waiq set lock

3564 *

3565 */

3566 void

 waitq_set_lazy_init_link(struct waitq_set *wqset)

3568 {

3569 struct waitq_link *link;

3570

         assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));

3572

3573 waitq_set_lock(wqset);

         if (!waitq_set_should_lazy_init_link(wqset)) {

3575 waitq_set_unlock(wqset);

3576 return;

3577 }

3578

         assert(wqset->wqset_id == WQSET_NOT_LINKED);

3580 waitq_set_unlock(wqset);

3581

3582 link = wql_alloc_link(WQL_WQS);

3583 if (!link) {

                 panic("Can't allocate link object for waitq set: %p", wqset);

3585 }

3586

3587 link->wql_wqs.wql_set = wqset;

3588

3589 waitq_set_lock(wqset);

         if (waitq_set_should_lazy_init_link(wqset)) {

3591 wql_mkvalid(link);

                 wqset->wqset_id = link->wql_setid.id;

3593 }

3594

         assert(wqset->wqset_id != 0);

         assert(wqset->wqset_id != WQSET_NOT_LINKED);

3597

3598 waitq_set_unlock(wqset);

3599

3600 wql_put_link(link);

3601

3602 return;

3603 }

3604

3605 /**

3606 * checks if a waitq set needs to be linked.

3607 *

3608 */

3609 boolean_t

 waitq_set_should_lazy_init_link(struct waitq_set *wqset)

3611 {

         if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) {

3613 return FALSE;

3614 }

3615 return TRUE;

3616 }

3617

3618 /**

3619 * clear out / release any resources associated with a waitq set

3620 *

3621 * Conditions:

3622 * may block

3623 * Note:

3624 * This will render the waitq set invalid, and it must

3625 * be re-initialized with waitq_set_init before it can be used again

3626 */

3627 void

 waitq_set_deinit(struct waitq_set *wqset)

3629 {

3630 struct waitq_link *link = NULL;

3631 uint64_t set_id, prepost_id;

3632

         if (!waitqs_is_set(wqset)) {

                 panic("trying to de-initialize an invalid wqset @%p", wqset);

3635 }

3636

         assert(!waitq_irq_safe(&wqset->wqset_q));

3638

3639 waitq_set_lock(wqset);

3640

         if (waitq_set_has_prepost_hook(wqset)) {

3642 waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;

3643 /*

3644 * If the wqset_prepost_hook value is non 0,

3645 * then another core is currently posting to this waitq set

3646 * and we need for it to finish what it's doing.

3647 */

                 while (os_atomic_load(hook, relaxed) != 0) {

3649 waitq_set_unlock(wqset);

3650 delay(1);

3651 waitq_set_lock(wqset);

3652 }

3653 }

3654

3655 set_id = wqset->wqset_id;

3656

         if (waitqs_is_linked(wqset) || set_id == 0) {

3658 /* grab the set's link object */

3659 link = wql_get_link(set_id);

3660 if (link) {

3661 wql_invalidate(link);

3662 }

3663 /* someone raced us to deinit */

                 if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {

3665 if (link) {

3666 wql_put_link(link);

3667 }

3668 waitq_set_unlock(wqset);

3669 return;

3670 }

3671

3672 /* the link should be a valid link object at this point */

                 assert(link != NULL && wql_type(link) == WQL_WQS);

3674

3675 wqset->wqset_id = 0;

3676 }

3677

3678 /*

3679 * This set may have a lot of preposts, or may have been a member of

3680 * many other sets. To minimize spinlock hold times, we clear out the

3681 * waitq set data structure under the lock-hold, but don't clear any

3682 * table objects. We keep handles to the prepost and set linkage

3683 * objects and free those outside the critical section.

3684 */

3685 prepost_id = 0;

         if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {

3687 assert(link != NULL);

3688 prepost_id = wqset->wqset_prepost_id;

3689 }

3690 /* else { TODO: notify kqueue subsystem? } */

3691 wqset->wqset_prepost_id = 0;

3692

         wqset->wqset_q.waitq_fifo = 0;

         wqset->wqset_q.waitq_prepost = 0;

         wqset->wqset_q.waitq_isvalid = 0;

3696

3697 /* don't clear the 'waitq_irq' bit: it's used in locking! */

         wqset->wqset_q.waitq_eventmask = 0;

3699

3700 waitq_unlink_all_unlock(&wqset->wqset_q);

3701 /* wqset->wqset_q unlocked and set links deallocated */

3702

3703

3704 if (link) {

3705 /*

3706 * walk_waitq_links may race with us for access to the waitq set.

3707 * If walk_waitq_links has a reference to the set, then we should wait

3708 * until the link's refcount goes to 1 (our reference) before we exit

3709 * this function. That way we ensure that the waitq set memory will

3710 * remain valid even though it's been cleared out.

3711 */

                 while (wql_refcnt(link) > 1) {

3713 delay(1);

3714 }

3715 wql_put_link(link);

3716 }

3717

3718 /* drop / unlink all the prepost table objects */

3719 /* JMM - can this happen before the delay? */

3720 if (prepost_id) {

                 (void)wq_prepost_iterate(prepost_id, NULL,

3722 wqset_clear_prepost_chain_cb);

3723 }

3724 }

3725

3726 /**

3727 * de-initialize and free an allocated waitq set object

3728 *

3729 * Conditions:

3730 * may block

3731 */

3732 kern_return_t

 waitq_set_free(struct waitq_set *wqset)

3734 {

3735 waitq_set_deinit(wqset);

3736

         memset(wqset, 0, sizeof(*wqset));

3738 zfree(waitq_set_zone, wqset);

3739

3740 return KERN_SUCCESS;

3741 }

3742

3743 #if DEVELOPMENT || DEBUG

3744 #if CONFIG_WAITQ_DEBUG

3745 /**

3746 * return the set ID of 'wqset'

3747 */

3748 uint64_t

 wqset_id(struct waitq_set *wqset)

3750 {

3751 if (!wqset) {

3752 return 0;

3753 }

3754

         assert(waitqs_is_set(wqset));

3756

         if (!waitqs_is_linked(wqset)) {

3758 waitq_set_lazy_init_link(wqset);

3759 }

3760

3761 return wqset->wqset_id;

3762 }

3763

3764 /**

3765 * returns a pointer to the waitq object embedded in 'wqset'

3766 */

3767 struct waitq *

 wqset_waitq(struct waitq_set *wqset)

3769 {

3770 if (!wqset) {

3771 return NULL;

3772 }

3773

         assert(waitqs_is_set(wqset));

3775

3776 return &wqset->wqset_q;

3777 }

3778 #endif /* CONFIG_WAITQ_DEBUG */

3779 #endif /* DEVELOPMENT || DEBUG */

3780

3781

3782 /**

3783 * clear all preposts originating from 'waitq'

3784 *

3785 * Conditions:

3786 * 'waitq' locked

3787 * may (rarely) spin waiting for another on-core thread to

3788 * release the last reference to the waitq's prepost link object

3789 *

3790 * NOTE:

3791 * If this function needs to spin, it will drop the waitq lock!

3792 * The return value of the function indicates whether or not this

3793 * happened: 1 == lock was dropped, 0 == lock held

3794 */

3795 int

 waitq_clear_prepost_locked(struct waitq *waitq)

3797 {

3798 struct wq_prepost *wqp;

3799 int dropped_lock = 0;

3800

         assert(!waitq_irq_safe(waitq));

3802

         if (waitq->waitq_prepost_id == 0) {

3804 return 0;

3805 }

3806

         wqp = wq_prepost_get(waitq->waitq_prepost_id);

3808 waitq->waitq_prepost_id = 0;

3809 if (wqp) {

                 uint64_t wqp_id = wqp->wqp_prepostid.id;

                 wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",

                     wqp->wqp_prepostid.id, wqp_refcnt(wqp));

3813 wq_prepost_invalidate(wqp);

                 while (wqp_refcnt(wqp) > 1) {

3815 /*

3816 * Some other thread must have raced us to grab a link

3817 * object reference before we invalidated it. This

3818 * means that they are probably trying to access the

3819 * waitq to which the prepost object points. We need

3820 * to wait here until the other thread drops their

3821 * reference. We know that no one else can get a

3822 * reference (the object has been invalidated), and

3823 * that prepost references are short-lived (dropped on

3824 * a call to wq_prepost_put). We also know that no one

3825 * blocks while holding a reference therefore the

3826 * other reference holder must be on-core. We'll just

3827 * sit and wait for the other reference to be dropped.

3828 */

3829 disable_preemption();

3830

3831 waitq_unlock(waitq);

3832 dropped_lock = 1;

3833 /*

3834 * don't yield here, just spin and assume the other

3835 * consumer is already on core...

3836 */

3837 delay(1);

3838

3839 waitq_lock(waitq);

3840

3841 enable_preemption();

3842 }

                 if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id) {

3844 wq_prepost_put(wqp);

3845 }

3846 }

3847

3848 return dropped_lock;

3849 }

3850

3851 /**

3852 * clear all preposts originating from 'waitq'

3853 *

3854 * Conditions:

3855 * 'waitq' is not locked

3856 * may disable and re-enable interrupts

3857 */

3858 void

 waitq_clear_prepost(struct waitq *waitq)

3860 {

         assert(waitq_valid(waitq));

         assert(!waitq_irq_safe(waitq));

3863

3864 waitq_lock(waitq);

3865 /* it doesn't matter to us if the lock is dropped here */

         (void)waitq_clear_prepost_locked(waitq);

3867 waitq_unlock(waitq);

3868 }

3869

3870 /**

3871 * return a the waitq's prepost object ID (allocate if necessary)

3872 *

3873 * Conditions:

3874 * 'waitq' is unlocked

3875 */

3876 uint64_t

 waitq_get_prepost_id(struct waitq *waitq)

3878 {

3879 struct wq_prepost *wqp;

3880 uint64_t wqp_id = 0;

3881

         if (!waitq_valid(waitq)) {

3883 return 0;

3884 }

3885

         assert(!waitq_irq_safe(waitq));

3887

3888 waitq_lock(waitq);

3889

         if (!waitq_valid(waitq)) {

3891 goto out_unlock;

3892 }

3893

3894 if (waitq->waitq_prepost_id) {

3895 wqp_id = waitq->waitq_prepost_id;

3896 goto out_unlock;

3897 }

3898

3899 /* don't hold a spinlock while allocating a prepost object */

3900 waitq_unlock(waitq);

3901

         wqp = wq_prepost_alloc(WQP_WQ, 1);

3903 if (!wqp) {

3904 return 0;

3905 }

3906

3907 /* re-acquire the waitq lock */

3908 waitq_lock(waitq);

3909

         if (!waitq_valid(waitq)) {

3911 wq_prepost_put(wqp);

3912 wqp_id = 0;

3913 goto out_unlock;

3914 }

3915

3916 if (waitq->waitq_prepost_id) {

3917 /* we were beat by someone else */

3918 wq_prepost_put(wqp);

3919 wqp_id = waitq->waitq_prepost_id;

3920 goto out_unlock;

3921 }

3922

3923 wqp->wqp_wq.wqp_wq_ptr = waitq;

3924

3925 wqp_set_valid(wqp);

3926 wqp_id = wqp->wqp_prepostid.id;

3927 waitq->waitq_prepost_id = wqp_id;

3928

3929 wq_prepost_put(wqp);

3930

3931 out_unlock:

3932 waitq_unlock(waitq);

3933

3934 return wqp_id;

3935 }

3936

3937

3938 static int

 waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)

3940 {

         uint64_t setid = *(uint64_t *)ctx;

         int wqltype = wql_type(link);

3943 (void)waitq;

         if (wqltype == WQL_WQS && link->wql_setid.id == setid) {

                 wqdbg_v("  waitq already in set 0x%llx", setid);

3946 return WQ_ITERATE_FOUND;

         } else if (wqltype == WQL_LINK) {

3948 /*

3949 * break out early if we see a link that points to the setid

3950 * in question. This saves us a step in the

3951 * iteration/recursion

3952 */

                 wqdbg_v("  waitq already in set 0x%llx (WQL_LINK)", setid);

                 if (link->wql_link.left_setid == setid ||

3955 link->wql_link.right_setid == setid) {

3956 return WQ_ITERATE_FOUND;

3957 }

3958 }

3959

3960 return WQ_ITERATE_CONTINUE;

3961 }

3962

3963 /**

3964 * determine if 'waitq' is a member of 'wqset'

3965 *

3966 * Conditions:

3967 * neither 'waitq' nor 'wqset' is not locked

3968 * may disable and re-enable interrupts while locking 'waitq'

3969 */

3970 boolean_t

 waitq_member(struct waitq *waitq, struct waitq_set *wqset)

3972 {

3973 kern_return_t kr = WQ_ITERATE_SUCCESS;

3974 uint64_t setid;

3975

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

3978 }

         assert(!waitq_irq_safe(waitq));

3980

         if (!waitqs_is_set(wqset)) {

3982 return FALSE;

3983 }

3984

3985 waitq_lock(waitq);

3986

         if (!waitqs_is_linked(wqset)) {

3988 goto out_unlock;

3989 }

3990

3991 setid = wqset->wqset_id;

3992

3993 /* fast path: most waitqs are members of only 1 set */

         if (waitq->waitq_set_id == setid) {

3995 waitq_unlock(waitq);

3996 return TRUE;

3997 }

3998

3999 /* walk the link table and look for the Set ID of wqset */

         kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,

             WQL_ALL, (void *)&setid, waitq_inset_cb);

4002

4003 out_unlock:

4004 waitq_unlock(waitq);

4005 return kr == WQ_ITERATE_FOUND;

4006 }

4007

4008 /**

4009 * Returns true is the given waitq is a member of at least 1 set

4010 */

4011 boolean_t

 waitq_in_set(struct waitq *waitq)

4013 {

4014 struct waitq_link *link;

4015 boolean_t inset = FALSE;

4016

         if (waitq_irq_safe(waitq)) {

4018 return FALSE;

4019 }

4020

4021 waitq_lock(waitq);

4022

4023 if (!waitq->waitq_set_id) {

4024 goto out_unlock;

4025 }

4026

         link = wql_get_link(waitq->waitq_set_id);

4028 if (link) {

4029 /* if we get here, the waitq is in _at_least_one_ set */

4030 inset = TRUE;

4031 wql_put_link(link);

4032 } else {

4033 /* we can just optimize this for next time */

4034 waitq->waitq_set_id = 0;

4035 }

4036

4037 out_unlock:

4038 waitq_unlock(waitq);

4039 return inset;

4040 }

4041

4042

4043 /**

4044 * pre-allocate a waitq link structure from the link table

4045 *

4046 * Conditions:

4047 * 'waitq' is not locked

4048 * may (rarely) block if link table needs to grow

4049 */

4050 uint64_t

 waitq_link_reserve(struct waitq *waitq)

4052 {

4053 struct waitq_link *link;

4054 uint64_t reserved_id = 0;

4055

         assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));

4057

4058 /*

4059 * We've asserted that the caller can block, so we enforce a

4060 * minimum-free table element policy here.

4061 */

4062 wql_ensure_free_space();

4063

4064 (void)waitq;

4065 link = wql_alloc_link(LT_RESERVED);

4066 if (!link) {

4067 return 0;

4068 }

4069

4070 reserved_id = link->wql_setid.id;

4071

4072 return reserved_id;

4073 }

4074

4075 /**

4076 * release a pre-allocated waitq link structure

4077 */

4078 void

4079 waitq_link_release(uint64_t id)

4080 {

4081 struct waitq_link *link;

4082

         if (id == 0) {

4084 return;

4085 }

4086

         link = wql_get_reserved(id, WQL_LINK);

4088 if (!link) {

4089 return;

4090 }

4091

4092 /*

4093 * if we successfully got a link object, then we know

4094 * it's not been marked valid, and can be released with

4095 * a standard wql_put_link() which should free the element.

4096 */

4097 wql_put_link(link);

4098 #if CONFIG_LTABLE_STATS

4099 g_wqlinktable.nreserved_releases += 1;

4100 #endif

4101 }

4102

4103 /**

4104 * link 'waitq' to the set identified by 'setid' using the 'link' structure

4105 *

4106 * Conditions:

4107 * 'waitq' is locked

4108 * caller should have a reference to the 'link' object

4109 */

4110 static kern_return_t

 waitq_link_internal(struct waitq *waitq,

     uint64_t setid, struct waitq_link *link)

4113 {

4114 struct waitq_link *qlink;

4115 kern_return_t kr;

4116

         assert(waitq_held(waitq));

         assert(setid != 0);

4119 assert(setid != WQSET_NOT_LINKED);

4120

4121 /*

4122 * If the waitq_set_id field is empty, then this waitq is not

4123 * a member of any other set. All we have to do is update the

4124 * field.

4125 */

4126 if (!waitq->waitq_set_id) {

4127 waitq->waitq_set_id = setid;

4128 return KERN_SUCCESS;

4129 }

4130

         qlink = wql_get_link(waitq->waitq_set_id);

4132 if (!qlink) {

4133 /*

4134 * The set to which this wait queue belonged has been

4135 * destroyed / invalidated. We can re-use the waitq field.

4136 */

4137 waitq->waitq_set_id = setid;

4138 return KERN_SUCCESS;

4139 }

4140 wql_put_link(qlink);

4141

4142 /*

4143 * Check to see if it's already a member of the set.

4144 *

4145 * TODO: check for cycles!

4146 */

         kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,

             WQL_ALL, (void *)&setid, waitq_inset_cb);

4149 if (kr == WQ_ITERATE_FOUND) {

4150 return KERN_ALREADY_IN_SET;

4151 }

4152

4153 /*

4154 * This wait queue is a member of at least one set already,

4155 * and _not_ a member of the given set. Use our previously

4156 * allocated link object, and hook it up to the wait queue.

4157 * Note that it's possible that one or more of the wait queue sets to

4158 * which the wait queue belongs was invalidated before we allocated

4159 * this link object. That's OK because the next time we use that

4160 * object we'll just ignore it.

4161 */

4162 link->wql_link.left_setid = setid;

         link->wql_link.right_setid = waitq->waitq_set_id;

4164 wql_mkvalid(link);

4165

         waitq->waitq_set_id = link->wql_setid.id;

4167

4168 return KERN_SUCCESS;

4169 }

4170

4171 /**

4172 * link 'waitq' to 'wqset'

4173 *

4174 * Conditions:

4175 * if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.

4176 * Otherwise, 'waitq' must be locked.

4177 *

4178 * may (rarely) block on link table allocation if the table has to grow,

4179 * and no 'reserved_link' object is passed.

4180 *

4181 * may block and acquire wqset lock if the wqset passed has no link.

4182 *

4183 * Notes:

4184 * The caller can guarantee that this function will never block by

4185 * - pre-allocating a link table object and passing its ID in 'reserved_link'

4186 * - and pre-allocating the waitq set link calling waitq_set_lazy_init_link.

4187 * It is not possible to provide a reserved_link without having also linked

4188 * the wqset.

4189 */

4190 kern_return_t

 waitq_link(struct waitq *waitq, struct waitq_set *wqset,

4192 waitq_lock_state_t lock_state, uint64_t *reserved_link)

4193 {

4194 kern_return_t kr;

4195 struct waitq_link *link;

4196 int should_lock = (lock_state == WAITQ_SHOULD_LOCK);

4197

         if (!waitq_valid(waitq) || waitq_irq_safe(waitq)) {

                 panic("Invalid waitq: %p", waitq);

4200 }

4201

         if (!waitqs_is_set(wqset)) {

4203 return KERN_INVALID_ARGUMENT;

4204 }

4205

         if (!reserved_link || *reserved_link == 0) {

                 if (!waitqs_is_linked(wqset)) {

4208 waitq_set_lazy_init_link(wqset);

4209 }

4210 }

4211

         wqdbg_v("Link waitq %p to wqset 0x%llx",

             (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);

4214

4215 /*

4216 * We _might_ need a new link object here, so we'll grab outside

4217 * the lock because the alloc call _might_ block.

4218 *

4219 * If the caller reserved a link beforehand, then wql_get_link

4220 * is guaranteed not to block because the caller holds an extra

4221 * reference to the link which, in turn, hold a reference to the

4222 * link table.

4223 */

         if (reserved_link && *reserved_link != 0) {

                 link = wql_get_reserved(*reserved_link, WQL_LINK);

4226 /* always consume the caller's reference */

4227 *reserved_link = 0;

4228 } else {

4229 link = wql_alloc_link(WQL_LINK);

4230 }

4231 if (!link) {

4232 return KERN_NO_SPACE;

4233 }

4234

4235 if (should_lock) {

4236 waitq_lock(waitq);

4237 }

4238

         kr = waitq_link_internal(waitq, wqset->wqset_id, link);

4240

4241 if (should_lock) {

4242 waitq_unlock(waitq);

4243 }

4244

4245 wql_put_link(link);

4246

4247 return kr;

4248 }

4249

4250 /**

4251 * helper: unlink 'waitq' from waitq set identified by 'setid'

4252 * this function also prunes invalid objects from the tree

4253 *

4254 * Conditions:

4255 * MUST be called from walk_waitq_links link table walk

4256 * 'waitq' is locked

4257 *

4258 * Notes:

4259 * This is a helper function which compresses the link table by culling

4260 * unused or unnecessary links. See comments below for different

4261 * scenarios.

4262 */

4263 static inline int

 waitq_maybe_remove_link(struct waitq *waitq,

4265 uint64_t setid,

4266 struct waitq_link *parent,

4267 struct waitq_link *left,

4268 struct waitq_link *right)

4269 {

         uint64_t *wq_setid = &waitq->waitq_set_id;

4271

4272 /*

4273 * There are two scenarios:

4274 *

4275 * Scenario 1:

4276 * --------------------------------------------------------------------

4277 * waitq->waitq_set_id == parent

4278 *

4279 * parent(LINK)

4280 * / \

4281 * / \

4282 * / \

4283 * L(LINK/WQS_l) R(LINK/WQS_r)

4284 *

4285 * In this scenario, we assert that the original waitq points to the

4286 * parent link we were passed in. If WQS_l (or WQS_r) is the waitq

4287 * set we're looking for, we can set the corresponding parent

4288 * link id (left or right) to 0. To compress the tree, we can reset the

4289 * waitq_set_id of the original waitq to point to the side of the

4290 * parent that is still valid. We then discard the parent link object.

4291 */

         if (*wq_setid == parent->wql_setid.id) {

4293 if (!left && !right) {

4294 /* completely invalid children */

4295 wql_invalidate(parent);

4296 wqdbg_v("S1, L+R");

4297 *wq_setid = 0;

4298 return WQ_ITERATE_INVALID;

                 } else if (!left || left->wql_setid.id == setid) {

4300 /*

4301 * left side matches we know it points either to the

4302 * WQS we're unlinking, or to an invalid object:

4303 * no need to invalidate it

4304 */

                         *wq_setid = right ? right->wql_setid.id : 0;

4306 wql_invalidate(parent);

4307 wqdbg_v("S1, L");

4308 return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;

                 } else if (!right || right->wql_setid.id == setid) {

4310 /*

4311 * if right side matches we know it points either to the

4312 * WQS we're unlinking, or to an invalid object:

4313 * no need to invalidate it

4314 */

                         *wq_setid = left ? left->wql_setid.id : 0;

4316 wql_invalidate(parent);

4317 wqdbg_v("S1, R");

4318 return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;

4319 }

4320 }

4321

4322 /*

4323 * the tree walk starts at the top-of-tree and moves down,

4324 * so these are safe asserts.

4325 */

         assert(left || right); /* one of them has to be valid at this point */

4327

4328 /*

4329 * Scenario 2:

4330 * --------------------------------------------------------------------

4331 * waitq->waitq_set_id == ... (OR parent)

4332 *

4333 * ...

4334 * |

4335 * parent

4336 * / \

4337 * / \

4338 * L(LINK) R(LINK)

4339 * /\ /\

4340 * / \ / \

4341 * / \ Rl(*) Rr(*)

4342 * Ll(WQS) Lr(WQS)

4343 *

4344 * In this scenario, a leaf node of either the left or right side

4345 * could be the wait queue set we're looking to unlink. We also handle

4346 * the case where one of these links is invalid. If a leaf node is

4347 * invalid or it's the set we're looking for, we can safely remove the

4348 * middle link (left or right) and point the parent link directly to

4349 * the remaining leaf node.

4350 */

         if (left && wql_type(left) == WQL_LINK) {

4352 uint64_t Ll, Lr;

4353 struct waitq_link *linkLl, *linkLr;

                 assert(left->wql_setid.id != setid);

4355 Ll = left->wql_link.left_setid;

4356 Lr = left->wql_link.right_setid;

4357 linkLl = wql_get_link(Ll);

4358 linkLr = wql_get_link(Lr);

4359 if (!linkLl && !linkLr) {

4360 /*

4361 * The left object points to two invalid objects!

4362 * We can invalidate the left w/o touching the parent.

4363 */

4364 wql_invalidate(left);

4365 wqdbg_v("S2, Ll+Lr");

4366 return WQ_ITERATE_INVALID;

                 } else if (!linkLl || Ll == setid) {

4368 /* Ll is invalid and/or the wait queue set we're looking for */

4369 parent->wql_link.left_setid = Lr;

4370 wql_invalidate(left);

4371 wql_put_link(linkLl);

4372 wql_put_link(linkLr);

4373 wqdbg_v("S2, Ll");

4374 return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;

                 } else if (!linkLr || Lr == setid) {

4376 /* Lr is invalid and/or the wait queue set we're looking for */

4377 parent->wql_link.left_setid = Ll;

4378 wql_invalidate(left);

4379 wql_put_link(linkLr);

4380 wql_put_link(linkLl);

4381 wqdbg_v("S2, Lr");

4382 return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;

4383 }

4384 wql_put_link(linkLl);

4385 wql_put_link(linkLr);

4386 }

4387

         if (right && wql_type(right) == WQL_LINK) {

4389 uint64_t Rl, Rr;

4390 struct waitq_link *linkRl, *linkRr;

                 assert(right->wql_setid.id != setid);

4392 Rl = right->wql_link.left_setid;

4393 Rr = right->wql_link.right_setid;

4394 linkRl = wql_get_link(Rl);

4395 linkRr = wql_get_link(Rr);

4396 if (!linkRl && !linkRr) {

4397 /*

4398 * The right object points to two invalid objects!

4399 * We can invalidate the right w/o touching the parent.

4400 */

4401 wql_invalidate(right);

4402 wqdbg_v("S2, Rl+Rr");

4403 return WQ_ITERATE_INVALID;

                 } else if (!linkRl || Rl == setid) {

4405 /* Rl is invalid and/or the wait queue set we're looking for */

4406 parent->wql_link.right_setid = Rr;

4407 wql_invalidate(right);

4408 wql_put_link(linkRl);

4409 wql_put_link(linkRr);

4410 wqdbg_v("S2, Rl");

4411 return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;

                 } else if (!linkRr || Rr == setid) {

4413 /* Rr is invalid and/or the wait queue set we're looking for */

4414 parent->wql_link.right_setid = Rl;

4415 wql_invalidate(right);

4416 wql_put_link(linkRl);

4417 wql_put_link(linkRr);

4418 wqdbg_v("S2, Rr");

4419 return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;

4420 }

4421 wql_put_link(linkRl);

4422 wql_put_link(linkRr);

4423 }

4424

4425 return WQ_ITERATE_CONTINUE;

4426 }

4427

4428 /**

4429 * link table walk callback that unlinks 'waitq' from 'ctx->setid'

4430 *

4431 * Conditions:

4432 * called from walk_waitq_links

4433 * 'waitq' is locked

4434 *

4435 * Notes:

4436 * uses waitq_maybe_remove_link() to compress the linktable and

4437 * perform the actual unlinking

4438 */

4439 static int

 waitq_unlink_cb(struct waitq *waitq, void *ctx,

4441 struct waitq_link *link)

4442 {

         uint64_t setid = *((uint64_t *)ctx);

4444 struct waitq_link *right, *left;

4445 int ret = 0;

4446

         if (wql_type(link) != WQL_LINK) {

4448 return WQ_ITERATE_CONTINUE;

4449 }

4450

4451 do {

                 left  = wql_get_link(link->wql_link.left_setid);

                 right = wql_get_link(link->wql_link.right_setid);

4454

                 ret = waitq_maybe_remove_link(waitq, setid, link, left, right);

4456

4457 wql_put_link(left);

4458 wql_put_link(right);

4459

                 if (!wql_is_valid(link)) {

4461 return WQ_ITERATE_INVALID;

4462 }

4463 /* A ret value of UNLINKED will break us out of table walk */

         } while (ret == WQ_ITERATE_INVALID);

4465

4466 return ret;

4467 }

4468

4469

4470 /**

4471 * undo/remove a prepost from 'ctx' (waitq) to 'wqset'

4472 *

4473 * Conditions:

4474 * Called from wq_prepost_foreach_locked OR wq_prepost_iterate

4475 * 'wqset' may be NULL

4476 * (ctx)->unlink_wqset is locked

4477 */

4478 static int

 waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,

     struct wq_prepost *wqp, struct waitq *waitq)

4481 {

         struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;

4483

         if (waitq != ulctx->unlink_wq) {

4485 return WQ_ITERATE_CONTINUE;

4486 }

4487

         if (wqp_type(wqp) == WQP_WQ &&

             wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {

4490 /* this is the only prepost on this wait queue set */

                 wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);

                 ulctx->unlink_wqset->wqset_prepost_id = 0;

4493 return WQ_ITERATE_BREAK;

4494 }

4495

         assert(wqp_type(wqp) == WQP_POST);

4497

4498 /*

4499 * The prepost object 'wqp' points to a waitq which should no longer

4500 * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost

4501 * object from the list and break out of the iteration. Using the

4502 * context object in this way allows this same callback function to be

4503 * used from both wq_prepost_foreach_locked and wq_prepost_iterate.

4504 */

         wq_prepost_remove(ulctx->unlink_wqset, wqp);

4506 return WQ_ITERATE_BREAK;

4507 }

4508

4509 /**

4510 * unlink 'waitq' from 'wqset'

4511 *

4512 * Conditions:

4513 * 'waitq' is locked

4514 * 'wqset' is _not_ locked

4515 * may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock

4516 * (see waitq_clear_prepost_locked)

4517 */

4518 static kern_return_t

 waitq_unlink_locked(struct waitq *waitq,

4520 struct waitq_set *wqset)

4521 {

4522 uint64_t setid;

4523 kern_return_t kr;

4524

         assert(!waitq_irq_safe(waitq));

4526

         if (waitq->waitq_set_id == 0) {

4528 /*

4529 * TODO:

4530 * it doesn't belong to anyone, and it has a prepost object?

4531 * This is an artifact of not cleaning up after kqueues when

4532 * they prepost into select sets...

4533 */

                 if (waitq->waitq_prepost_id != 0) {

                         (void)waitq_clear_prepost_locked(waitq);

4536 }

4537 return KERN_NOT_IN_SET;

4538 }

4539

         if (!waitqs_is_linked(wqset)) {

4541 /*

4542 * No link has been allocated for the wqset,

4543 * so no waitq could have been linked to it.

4544 */

4545 return KERN_NOT_IN_SET;

4546 }

4547

4548 setid = wqset->wqset_id;

4549

         if (waitq->waitq_set_id == setid) {

4551 waitq->waitq_set_id = 0;

4552 /*

4553 * This was the only set to which the waitq belonged: we can

4554 * safely release the waitq's prepost object. It doesn't

4555 * matter if this function drops and re-acquires the lock

4556 * because we're not manipulating waitq state any more.

4557 */

                 (void)waitq_clear_prepost_locked(waitq);

4559 return KERN_SUCCESS;

4560 }

4561

4562 /*

4563 * The waitq was a member of more that 1 set, so we need to

4564 * handle potentially compressing the link table, and

4565 * adjusting the waitq->waitq_set_id value.

4566 *

4567 * Note: we can't free the waitq's associated prepost object (if any)

4568 * because it may be in use by the one or more _other_ sets to

4569 * which this queue belongs.

4570 *

4571 * Note: This function only handles a single level of the queue linkage.

4572 * Removing a waitq from a set to which it does not directly

4573 * belong is undefined. For example, if a waitq belonged to set

4574 * A, and set A belonged to set B. You can't remove the waitq

4575 * from set B.

4576 */

         kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,

             WQL_LINK, (void *)&setid, waitq_unlink_cb);

4579

4580 if (kr == WQ_ITERATE_UNLINKED) {

4581 struct wq_unlink_ctx ulctx;

4582

4583 kr = KERN_SUCCESS; /* found it and dis-associated it */

4584

4585 /* don't look for preposts if it's not prepost-enabled */

                 if (!wqset->wqset_q.waitq_prepost) {

4587 goto out;

4588 }

4589

                 assert(!waitq_irq_safe(&wqset->wqset_q));

4591

4592 waitq_set_lock(wqset);

4593 /*

4594 * clear out any prepost from waitq into wqset

4595 * TODO: this could be more efficient than a linear search of

4596 * the waitq set's prepost list.

4597 */

4598 ulctx.unlink_wq = waitq;

4599 ulctx.unlink_wqset = wqset;

                 (void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,

4601 waitq_unlink_prepost_cb);

4602 waitq_set_unlock(wqset);

4603 } else {

4604 kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */

4605 }

4606

4607 out:

4608 return kr;

4609 }

4610

4611 /**

4612 * unlink 'waitq' from 'wqset'

4613 *

4614 * Conditions:

4615 * neither 'waitq' nor 'wqset' is locked

4616 * may disable and re-enable interrupts

4617 * may (rarely) spin in prepost clear

4618 * (see waitq_clear_prepost_locked)

4619 */

4620 kern_return_t

 waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)

4622 {

4623 kern_return_t kr = KERN_SUCCESS;

4624

         assert(waitqs_is_set(wqset));

4626

4627 /*

4628 * we allow the waitq to be invalid because the caller may be trying

4629 * to clear out old/dirty state

4630 */

         if (!waitq_valid(waitq)) {

4632 return KERN_INVALID_ARGUMENT;

4633 }

4634

         wqdbg_v("unlink waitq %p from set 0x%llx",

             (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);

4637

         assert(!waitq_irq_safe(waitq));

4639

4640 waitq_lock(waitq);

4641

         kr = waitq_unlink_locked(waitq, wqset);

4643

4644 waitq_unlock(waitq);

4645 return kr;

4646 }

4647

4648 /**

4649 * unlink a waitq from a waitq set, but reference the waitq by its prepost ID

4650 *

4651 * Conditions:

4652 * 'wqset' is unlocked

4653 * wqp_id may be valid or invalid

4654 */

4655 void

 waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)

4657 {

4658 struct wq_prepost *wqp;

4659

4660 disable_preemption();

4661 wqp = wq_prepost_get(wqp_id);

4662 if (wqp) {

4663 struct waitq *wq;

4664

4665 wq = wqp->wqp_wq.wqp_wq_ptr;

4666

4667 /*

4668 * lock the waitq, then release our prepost ID reference, then

4669 * unlink the waitq from the wqset: this ensures that we don't

4670 * hold a prepost ID reference during the unlink, but we also

4671 * complete the unlink operation atomically to avoid a race

4672 * with waitq_unlink[_all].

4673 */

                 assert(!waitq_irq_safe(wq));

4675

4676 waitq_lock(wq);

4677 wq_prepost_put(wqp);

4678

                 if (!waitq_valid(wq)) {

4680 /* someone already tore down this waitq! */

4681 waitq_unlock(wq);

4682 enable_preemption();

4683 return;

4684 }

4685

4686 /* this _may_ drop the wq lock, but that's OK */

4687 waitq_unlink_locked(wq, wqset);

4688

4689 waitq_unlock(wq);

4690 }

4691 enable_preemption();

4692 return;

4693 }

4694

4695

4696 /**

4697 * reference and lock a waitq by its prepost ID

4698 *

4699 * Conditions:

4700 * wqp_id may be valid or invalid

4701 *

4702 * Returns:

4703 * a locked waitq if wqp_id was valid

4704 * NULL on failure

4705 */

4706 struct waitq *

4707 waitq_lock_by_prepost_id(uint64_t wqp_id)

4708 {

4709 struct waitq *wq = NULL;

4710 struct wq_prepost *wqp;

4711

4712 disable_preemption();

4713 wqp = wq_prepost_get(wqp_id);

4714 if (wqp) {

4715 wq = wqp->wqp_wq.wqp_wq_ptr;

4716

                 assert(!waitq_irq_safe(wq));

4718

4719 waitq_lock(wq);

4720 wq_prepost_put(wqp);

4721

                 if (!waitq_valid(wq)) {

4723 /* someone already tore down this waitq! */

4724 waitq_unlock(wq);

4725 enable_preemption();

4726 return NULL;

4727 }

4728 }

4729 enable_preemption();

4730 return wq;

4731 }

4732

4733

4734 /**

4735 * unlink 'waitq' from all sets to which it belongs

4736 *

4737 * Conditions:

4738 * 'waitq' is locked on entry

4739 * returns with waitq lock dropped

4740 *

4741 * Notes:

4742 * may (rarely) spin (see waitq_clear_prepost_locked)

4743 */

4744 kern_return_t

 waitq_unlink_all_unlock(struct waitq *waitq)

4746 {

4747 uint64_t old_set_id = 0;

         wqdbg_v("unlink waitq %p from all sets",

             (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));

         assert(!waitq_irq_safe(waitq));

4751

4752 /* it's not a member of any sets */

         if (waitq->waitq_set_id == 0) {

4754 waitq_unlock(waitq);

4755 return KERN_SUCCESS;

4756 }

4757

4758 old_set_id = waitq->waitq_set_id;

4759 waitq->waitq_set_id = 0;

4760

4761 /*

4762 * invalidate the prepost entry for this waitq.

4763 * This may drop and re-acquire the waitq lock, but that's OK because

4764 * if it was added to another set and preposted to that set in the

4765 * time we drop the lock, the state will remain consistent.

4766 */

         (void)waitq_clear_prepost_locked(waitq);

4768

4769 waitq_unlock(waitq);

4770

4771 if (old_set_id) {

4772 /*

4773 * Walk the link table and invalidate each LINK object that

4774 * used to connect this waitq to one or more sets: this works

4775 * because WQL_LINK objects are private to each wait queue

4776 */

                 (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id,

4778 WQL_LINK, NULL, waitq_unlink_all_cb);

4779 }

4780

4781 return KERN_SUCCESS;

4782 }

4783

4784 /**

4785 * unlink 'waitq' from all sets to which it belongs

4786 *

4787 * Conditions:

4788 * 'waitq' is not locked

4789 * may disable and re-enable interrupts

4790 * may (rarely) spin

4791 * (see waitq_unlink_all_locked, waitq_clear_prepost_locked)

4792 */

4793 kern_return_t

 waitq_unlink_all(struct waitq *waitq)

4795 {

4796 kern_return_t kr = KERN_SUCCESS;

4797

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

4800 }

4801

         assert(!waitq_irq_safe(waitq));

4803 waitq_lock(waitq);

         if (!waitq_valid(waitq)) {

4805 waitq_unlock(waitq);

4806 return KERN_SUCCESS;

4807 }

4808

4809 kr = waitq_unlink_all_unlock(waitq);

4810 /* waitq unlocked and set links deallocated */

4811

4812 return kr;

4813 }

4814

4815

4816 /**

4817 * unlink all waitqs from 'wqset'

4818 *

4819 * Conditions:

4820 * 'wqset' is locked on entry

4821 * 'wqset' is unlocked on exit and spl is restored

4822 *

4823 * Note:

4824 * may (rarely) spin/block (see waitq_clear_prepost_locked)

4825 */

4826 kern_return_t

 waitq_set_unlink_all_unlock(struct waitq_set *wqset)

4828 {

4829 struct waitq_link *link;

4830 uint64_t prepost_id;

4831

         wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);

4833

4834 /*

4835 * This operation does not require interaction with any of the set's

4836 * constituent wait queues. All we have to do is invalidate the SetID

4837 */

4838

         if (waitqs_is_linked(wqset)) {

4840 /* invalidate and re-alloc the link object first */

                 link = wql_get_link(wqset->wqset_id);

4842

4843 /* we may have raced with a waitq_set_deinit: handle this */

4844 if (!link) {

4845 waitq_set_unlock(wqset);

4846 return KERN_SUCCESS;

4847 }

4848

4849 wql_invalidate(link);

4850

4851 /* re-alloc the object to get a new generation ID */

4852 wql_realloc_link(link, WQL_WQS);

4853 link->wql_wqs.wql_set = wqset;

4854

                 wqset->wqset_id = link->wql_setid.id;

4856 wql_mkvalid(link);

4857 wql_put_link(link);

4858 }

4859

4860 /* clear any preposts attached to this set */

4861 prepost_id = 0;

         if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {

4863 prepost_id = wqset->wqset_prepost_id;

4864 }

4865 /* else { TODO: notify kqueue subsystem? } */

4866 wqset->wqset_prepost_id = 0;

4867

4868 /*

4869 * clear set linkage and prepost object associated with this set:

4870 * waitq sets may prepost to other sets if, for example, they are

4871 * associated with a kqueue which is in a select set.

4872 *

4873 * This releases all the set link objects

4874 * (links to other sets to which this set was previously added)

4875 */

4876 waitq_unlink_all_unlock(&wqset->wqset_q);

4877 /* wqset->wqset_q unlocked */

4878

4879 /* drop / unlink all the prepost table objects */

4880 if (prepost_id) {

                 (void)wq_prepost_iterate(prepost_id, NULL,

4882 wqset_clear_prepost_chain_cb);

4883 }

4884

4885 return KERN_SUCCESS;

4886 }

4887

4888 /**

4889 * unlink all waitqs from 'wqset'

4890 *

4891 * Conditions:

4892 * 'wqset' is not locked

4893 * may (rarely) spin/block (see waitq_clear_prepost_locked)

4894 */

4895 kern_return_t

 waitq_set_unlink_all(struct waitq_set *wqset)

4897 {

         assert(waitqs_is_set(wqset));

         assert(!waitq_irq_safe(&wqset->wqset_q));

4900

4901 waitq_set_lock(wqset);

4902 return waitq_set_unlink_all_unlock(wqset);

4903 /* wqset unlocked and set links and preposts deallocated */

4904 }

4905

4906 static int

 waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,

4908 struct waitq_link *link)

4909 {

         uint32_t *num = (uint32_t *)ctx;

4911 (void)waitq;

4912

4913 /*

4914 * In the worst case, we'll have to allocate 2 prepost objects

4915 * per waitq set (if the set was already preposted by another

4916 * waitq).

4917 */

         if (wql_type(link) == WQL_WQS) {

4919 /*

4920 * check to see if the associated waitq actually supports

4921 * preposting

4922 */

                 if (waitq_set_can_prepost(link->wql_wqs.wql_set)) {

4924 *num += 2;

4925 }

4926 }

4927 return WQ_ITERATE_CONTINUE;

4928 }

4929

4930 static int

 waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,

     int *did_unlock, struct wq_prepost **wqp)

4933 {

4934 struct wq_prepost *tmp;

4935 struct wqp_cache *cache;

4936

4937 *did_unlock = 0;

4938

4939 /*

4940 * Before we unlock the waitq, check the per-processor prepost object

4941 * cache to see if there's enough there for us. If so, do the

4942 * allocation, keep the lock and save an entire iteration over the set

4943 * linkage!

4944 */

4945 if (waitq) {

4946 disable_preemption();

4947 cache = PERCPU_GET(wqp_cache);

                 if (nalloc <= (int)cache->avail) {

4949 goto do_alloc;

4950 }

4951 enable_preemption();

4952

4953 /* unlock the waitq to perform the allocation */

4954 *did_unlock = 1;

4955 waitq_unlock(waitq);

4956 }

4957

4958 do_alloc:

         tmp = wq_prepost_alloc(LT_RESERVED, nalloc);

4960 if (!tmp) {

                 panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",

4962 nalloc, waitq, *wqp);

4963 }

4964 if (*wqp) {

4965 /* link the two lists */

4966 int __assert_only rc;

                 rc = wq_prepost_rlink(tmp, *wqp);

4968 assert(rc == nalloc);

4969 }

4970 *wqp = tmp;

4971

4972 /*

4973 * If the caller can block, then enforce a minimum-free table element

4974 * policy here. This helps ensure that we will have enough prepost

4975 * objects for callers such as selwakeup() that can be called with

4976 * spin locks held.

4977 */

         if (get_preemption_level() == 0) {

4979 wq_prepost_ensure_free_space();

4980 }

4981

4982 if (waitq) {

                 if (*did_unlock == 0) {

4984 /* decrement the preemption count if alloc from cache */

4985 enable_preemption();

4986 } else {

4987 /* otherwise: re-lock the waitq */

4988 waitq_lock(waitq);

4989 }

4990 }

4991

4992 return nalloc;

4993 }

4994

4995 static int

 waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)

4997 {

4998 int npreposts = 0;

4999

5000 /*

5001 * If the waitq is not currently part of a set, and we're not asked to

5002 * keep the waitq locked then we'll want to have 3 in reserve

5003 * just-in-case it becomes part of a set while we unlock and reserve.

5004 * We may need up to 1 object for the waitq, and 2 for the set.

5005 */

         if (waitq->waitq_set_id == 0) {

5007 npreposts = 3;

5008 } else {

5009 /* this queue has never been preposted before */

                 if (waitq->waitq_prepost_id == 0) {

5011 npreposts = 3;

5012 }

5013

5014 /*

5015 * Walk the set of table linkages associated with this waitq

5016 * and count the worst-case number of prepost objects that

5017 * may be needed during a wakeup_all. We can walk this without

5018 * locking each set along the way because the table-based IDs

5019 * disconnect us from the set pointers themselves, and the

5020 * table walking is careful to read the setid values only once.

5021 * Locking each set up the chain also doesn't guarantee that

5022 * their membership won't change between the time we unlock

5023 * that set and when we actually go to prepost, so our

5024 * situation is no worse than before and we've alleviated lock

5025 * contention on any sets to which this waitq belongs.

5026 */

                 (void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED,

5028 waitq, waitq->waitq_set_id,

5029 WQL_WQS, (void *)&npreposts,

5030 waitq_prepost_reserve_cb);

5031 }

5032

         if (extra > 0) {

5034 npreposts += extra;

5035 }

5036

         if (npreposts == 0 && !keep_locked) {

5038 /*

5039 * If we get here, we were asked to reserve some prepost

5040 * objects for a waitq that's previously preposted, and is not

5041 * currently a member of any sets. We have also been

5042 * instructed to unlock the waitq when we're done. In this

5043 * case, we pre-allocated enough reserved objects to handle

5044 * the case where the waitq gets added to a single set when

5045 * the lock is released.

5046 */

5047 npreposts = 3;

5048 }

5049

5050 return npreposts;

5051 }

5052

5053

5054 /**

5055 * pre-allocate prepost objects for 'waitq'

5056 *

5057 * Conditions:

5058 * 'waitq' is not locked

5059 *

5060 * Returns:

5061 * panic on error

5062 *

5063 * 0 on success, '*reserved' is set to the head of a singly-linked

5064 * list of pre-allocated prepost objects.

5065 *

5066 * Notes:

5067 * If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation

5068 * atomically and returns 'waitq' locked.

5069 *

5070 * This function attempts to pre-allocate precisely enough prepost

5071 * objects based on the current set membership of 'waitq'. If the

5072 * operation is performed atomically, then the caller

5073 * is guaranteed to have enough pre-allocated prepost object to avoid

5074 * any (rare) blocking in the wakeup path.

5075 */

5076 uint64_t

 waitq_prepost_reserve(struct waitq *waitq, int extra,

5078 waitq_lock_state_t lock_state)

5079 {

5080 uint64_t reserved = 0;

         uint64_t prev_setid = 0, prev_prepostid = 0;

5082 struct wq_prepost *wqp = NULL;

         int nalloc = 0, npreposts = 0;

5084 int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);

5085 int unlocked = 0;

5086

         wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",

             (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);

5089

         if (waitq == NULL && extra > 0) {

5091 /*

5092 * Simple prepost object allocation:

5093 * we'll add 2 more because the waitq might need an object,

5094 * and the set itself may need a new POST object in addition

5095 * to the number of preposts requested by the caller

5096 */

                 nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL,

5098 &unlocked, &wqp);

                 assert(nalloc == extra + 2);

5100 return wqp->wqp_prepostid.id;

5101 }

5102

         assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);

5104

         assert(!waitq_irq_safe(waitq));

5106

5107 waitq_lock(waitq);

5108

5109 /* remember the set ID that we started with */

5110 prev_setid = waitq->waitq_set_id;

5111 prev_prepostid = waitq->waitq_prepost_id;

5112

5113 /*

5114 * If the waitq is not part of a set, and we're asked to

5115 * keep the set locked, then we don't have to reserve

5116 * anything!

5117 */

         if (prev_setid == 0 && keep_locked) {

5119 goto out;

5120 }

5121

         npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);

5123

5124 /* nothing for us to do! */

         if (npreposts == 0) {

5126 if (keep_locked) {

5127 goto out;

5128 }

5129 goto out_unlock;

5130 }

5131

5132 try_alloc:

5133 /* this _may_ unlock and relock the waitq! */

         nalloc = waitq_alloc_prepost_reservation(npreposts, waitq,

5135 &unlocked, &wqp);

5136

5137 if (!unlocked) {

5138 /* allocation held the waitq lock: we'd done! */

5139 if (keep_locked) {

5140 goto out;

5141 }

5142 goto out_unlock;

5143 }

5144

5145 /*

5146 * Before we return, if the allocation had to unlock the waitq, we

5147 * must check one more time to see if we have enough. If not, we'll

5148 * try to allocate the difference. If the caller requests it, we'll

5149 * also leave the waitq locked so that the use of the pre-allocated

5150 * prepost objects can be guaranteed to be enough if a wakeup_all is

5151 * performed before unlocking the waitq.

5152 */

5153

5154 /*

5155 * If the waitq is no longer associated with a set, or if the waitq's

5156 * set/prepostid has not changed since we first walked its linkage,

5157 * we're done.

5158 */

         if ((waitq->waitq_set_id == 0) ||

5160 (waitq->waitq_set_id == prev_setid &&

5161 waitq->waitq_prepost_id == prev_prepostid)) {

5162 if (keep_locked) {

5163 goto out;

5164 }

5165 goto out_unlock;

5166 }

5167

         npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);

5169

5170 if (npreposts > nalloc) {

5171 prev_setid = waitq->waitq_set_id;

5172 prev_prepostid = waitq->waitq_prepost_id;

5173 npreposts = npreposts - nalloc; /* only allocate the diff */

5174 goto try_alloc;

5175 }

5176

5177 if (keep_locked) {

5178 goto out;

5179 }

5180

5181 out_unlock:

5182 waitq_unlock(waitq);

5183 out:

5184 if (wqp) {

5185 reserved = wqp->wqp_prepostid.id;

5186 }

5187

5188 return reserved;

5189 }

5190

5191 /**

5192 * release a linked list of prepost objects allocated via _prepost_reserve

5193 *

5194 * Conditions:

5195 * may (rarely) spin waiting for prepost table growth memcpy

5196 */

5197 void

5198 waitq_prepost_release_reserve(uint64_t id)

5199 {

5200 struct wq_prepost *wqp;

5201

         wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);

5203

5204 wqp = wq_prepost_rfirst(id);

5205 if (!wqp) {

5206 return;

5207 }

5208

5209 wq_prepost_release_rlist(wqp);

5210 }

5211

5212

5213 /**

5214 * clear all preposts from 'wqset'

5215 *

5216 * Conditions:

5217 * 'wqset' is not locked

5218 */

5219 void

 waitq_set_clear_preposts(struct waitq_set *wqset)

5221 {

5222 uint64_t prepost_id;

5223 spl_t spl;

5224

         assert(waitqs_is_set(wqset));

5226

         if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id) {

5228 return;

5229 }

5230

         wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",

5232 wqset->wqset_id);

5233

         if (waitq_irq_safe(&wqset->wqset_q)) {

5235 spl = splsched();

5236 }

5237 waitq_set_lock(wqset);

5238 prepost_id = wqset->wqset_prepost_id;

5239 wqset->wqset_prepost_id = 0;

5240 waitq_set_unlock(wqset);

         if (waitq_irq_safe(&wqset->wqset_q)) {

5242 splx(spl);

5243 }

5244

5245 /* drop / unlink all the prepost table objects */

5246 if (prepost_id) {

                 (void)wq_prepost_iterate(prepost_id, NULL,

5248 wqset_clear_prepost_chain_cb);

5249 }

5250 }

5251

5252

5253 /* ----------------------------------------------------------------------

5254 *

5255 * Iteration: waitq -> sets / waitq_set -> preposts

5256 *

5257 * ---------------------------------------------------------------------- */

5258

5259 struct wq_it_ctx {

5260 void *input;

5261 void *ctx;

5262 waitq_iterator_t it;

5263 };

5264

5265 static int

 waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,

5267 struct waitq_link *link)

5268 {

         struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);

5270 struct waitq_set *wqset;

5271 int ret;

5272

5273 (void)waitq;

         assert(!waitq_irq_safe(waitq));

         assert(wql_type(link) == WQL_WQS);

5276

5277 /*

5278 * the waitq is locked, so we can just take the set lock

5279 * and call the iterator function

5280 */

5281 wqset = link->wql_wqs.wql_set;

5282 assert(wqset != NULL);

         assert(!waitq_irq_safe(&wqset->wqset_q));

5284 waitq_set_lock(wqset);

5285

         ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);

5287

5288 waitq_set_unlock(wqset);

5289 return ret;

5290 }

5291

5292 /**

5293 * call external iterator function for each prepost object in wqset

5294 *

5295 * Conditions:

5296 * Called from wq_prepost_foreach_locked

5297 * (wqset locked, waitq _not_ locked)

5298 */

5299 static int

 wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,

     struct wq_prepost *wqp, struct waitq *waitq)

5302 {

         struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);

5304 uint64_t wqp_id;

5305 int ret;

5306

5307 (void)wqp;

5308

5309 /*

5310 * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.

5311 * Taking the 'waitq' lock is a lock order violation, so we need to be

5312 * careful. We also must realize that we may have taken a reference to

5313 * the 'wqp' just as the associated waitq was being torn down (or

5314 * clearing all its preposts) - see waitq_clear_prepost_locked(). If

5315 * the 'wqp' is valid and we can get the waitq lock, then we are good

5316 * to go. If not, we need to back off, check that the 'wqp' hasn't

5317 * been invalidated, and try to re-take the locks.

5318 */

         assert(!waitq_irq_safe(waitq));

5320

         if (waitq_lock_try(waitq)) {

5322 goto call_iterator;

5323 }

5324

         if (!wqp_is_valid(wqp)) {

5326 return WQ_ITERATE_RESTART;

5327 }

5328

5329 /* We are passed a prepost object with a reference on it. If neither

5330 * the waitq set nor the waitq require interrupts disabled, then we

5331 * may block on the delay(1) call below. We can't hold a prepost

5332 * object reference while blocking, so we have to give that up as well

5333 * and re-acquire it when we come back.

5334 */

5335 wqp_id = wqp->wqp_prepostid.id;

5336 wq_prepost_put(wqp);

5337 waitq_set_unlock(wqset);

         wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",

             wqset, wqp, wqp->wqp_prepostid.id, waitq);

5340 delay(1);

5341 waitq_set_lock(wqset);

5342 wqp = wq_prepost_get(wqp_id);

5343 if (!wqp) {

5344 /* someone cleared preposts while we slept! */

5345 return WQ_ITERATE_DROPPED;

5346 }

5347

5348 /*

5349 * TODO:

5350 * This differs slightly from the logic in ipc_mqueue.c:

5351 * ipc_mqueue_receive_on_thread(). There, if the waitq lock

5352 * can't be obtained, the prepost link is placed on the back of

5353 * the chain, and the iteration starts from the beginning. Here,

5354 * we just restart from the beginning.

5355 */

5356 return WQ_ITERATE_RESTART;

5357

5358 call_iterator:

         if (!wqp_is_valid(wqp)) {

5360 ret = WQ_ITERATE_RESTART;

5361 goto out_unlock;

5362 }

5363

5364 /* call the external callback */

         ret = wctx->it(wctx->ctx, waitq, wqset);

5366

5367 if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {

5368 ret = WQ_ITERATE_BREAK;

5369 goto out;

5370 }

5371

5372 out_unlock:

5373 waitq_unlock(waitq);

5374 out:

5375 return ret;

5376 }

5377

5378 /**

5379 * iterator over all sets to which the given waitq has been linked

5380 *

5381 * Conditions:

5382 * 'waitq' is locked

5383 */

5384 int

 waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)

5386 {

5387 int ret;

5388 struct wq_it_ctx wctx = {

                 .input = (void *)waitq,

5390 .ctx = ctx,

5391 .it = it,

5392 };

5393 if (!it || !waitq) {

5394 return KERN_INVALID_ARGUMENT;

5395 }

5396

         ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,

             WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb);

5399 if (ret == WQ_ITERATE_CONTINUE) {

5400 ret = WQ_ITERATE_SUCCESS;

5401 }

5402 return ret;

5403 }

5404

5405 /**

5406 * iterator over all preposts in the given wqset

5407 *

5408 * Conditions:

5409 * 'wqset' is locked

5410 */

5411 int

 waitq_set_iterate_preposts(struct waitq_set *wqset,

5413 void *ctx, waitq_iterator_t it)

5414 {

5415 struct wq_it_ctx wctx = {

                 .input = (void *)wqset,

5417 .ctx = ctx,

5418 .it = it,

5419 };

5420 if (!it || !wqset) {

5421 return WQ_ITERATE_INVALID;

5422 }

5423

         assert(waitq_held(&wqset->wqset_q));

5425

         return wq_prepost_foreach_locked(wqset, (void *)&wctx,

5427 wqset_iterate_prepost_cb);

5428 }

5429

5430

5431 /* ----------------------------------------------------------------------

5432 *

5433 * Higher-level APIs

5434 *

5435 * ---------------------------------------------------------------------- */

5436

5437

5438 /**

5439 * declare a thread's intent to wait on 'waitq' for 'wait_event'

5440 *

5441 * Conditions:

5442 * 'waitq' is not locked

5443 */

5444 wait_result_t

 waitq_assert_wait64(struct waitq *waitq,

5446 event64_t wait_event,

5447 wait_interrupt_t interruptible,

5448 uint64_t deadline)

5449 {

5450 thread_t thread = current_thread();

5451 wait_result_t ret;

5452 spl_t s;

5453

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

5456 }

5457

         if (waitq_irq_safe(waitq)) {

5459 s = splsched();

5460 }

5461

5462 waitq_lock(waitq);

         ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,

5464 TIMEOUT_URGENCY_SYS_NORMAL,

5465 deadline, TIMEOUT_NO_LEEWAY, thread);

5466 waitq_unlock(waitq);

5467

         if (waitq_irq_safe(waitq)) {

5469 splx(s);

5470 }

5471

5472 return ret;

5473 }

5474

5475 /**

5476 * declare a thread's intent to wait on 'waitq' for 'wait_event'

5477 *

5478 * Conditions:

5479 * 'waitq' is not locked

5480 * will disable and re-enable interrupts while locking current_thread()

5481 */

5482 wait_result_t

 waitq_assert_wait64_leeway(struct waitq *waitq,

5484 event64_t wait_event,

5485 wait_interrupt_t interruptible,

5486 wait_timeout_urgency_t urgency,

5487 uint64_t deadline,

5488 uint64_t leeway)

5489 {

5490 wait_result_t ret;

5491 thread_t thread = current_thread();

5492 spl_t s;

5493

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

5496 }

5497

         if (waitq_irq_safe(waitq)) {

5499 s = splsched();

5500 }

5501

5502 waitq_lock(waitq);

         ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,

5504 urgency, deadline, leeway, thread);

5505 waitq_unlock(waitq);

5506

         if (waitq_irq_safe(waitq)) {

5508 splx(s);

5509 }

5510

5511 return ret;

5512 }

5513

5514 /**

5515 * wakeup a single thread from a waitq that's waiting for a given event

5516 *

5517 * Conditions:

5518 * 'waitq' is not locked

5519 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets

5520 * may disable and re-enable interrupts

5521 *

5522 * Notes:

5523 * will _not_ block if waitq is global (or not a member of any set)

5524 */

5525 kern_return_t

 waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,

5527 wait_result_t result, int priority)

5528 {

5529 kern_return_t kr;

5530 uint64_t reserved_preposts = 0;

5531 spl_t spl;

5532

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

5535 }

5536

         if (!waitq_irq_safe(waitq)) {

5538 /* reserve preposts in addition to locking the waitq */

                 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);

5540 } else {

5541 spl = splsched();

5542 waitq_lock(waitq);

5543 }

5544

5545 /* waitq is locked upon return */

         kr = waitq_wakeup64_one_locked(waitq, wake_event, result,

             &reserved_preposts, priority, WAITQ_UNLOCK, WQ_OPTION_NONE);

5548

         if (waitq_irq_safe(waitq)) {

5550 splx(spl);

5551 }

5552

5553 /* release any left-over prepost object (won't block/lock anything) */

5554 waitq_prepost_release_reserve(reserved_preposts);

5555

5556 return kr;

5557 }

5558

5559 /**

5560 * wakeup all threads from a waitq that are waiting for a given event

5561 *

5562 * Conditions:

5563 * 'waitq' is not locked

5564 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets

5565 * may disable and re-enable interrupts

5566 *

5567 * Notes:

5568 * will _not_ block if waitq is global (or not a member of any set)

5569 */

5570 kern_return_t

 waitq_wakeup64_all(struct waitq *waitq,

5572 event64_t wake_event,

5573 wait_result_t result,

5574 int priority)

5575 {

5576 kern_return_t ret;

5577 uint64_t reserved_preposts = 0;

5578 spl_t s;

5579

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

5582 }

5583

         if (!waitq_irq_safe(waitq)) {

5585 /* reserve preposts in addition to locking waitq */

                 reserved_preposts = waitq_prepost_reserve(waitq, 0,

5587 WAITQ_KEEP_LOCKED);

5588 } else {

5589 s = splsched();

5590 waitq_lock(waitq);

5591 }

5592

         ret = waitq_wakeup64_all_locked(waitq, wake_event, result,

5594 &reserved_preposts, priority,

5595 WAITQ_UNLOCK);

5596

         if (waitq_irq_safe(waitq)) {

5598 splx(s);

5599 }

5600

5601 waitq_prepost_release_reserve(reserved_preposts);

5602

5603 return ret;

5604 }

5605

5606 /**

5607 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'

5608 *

5609 * Conditions:

5610 * 'waitq' is not locked

5611 *

5612 * Notes:

5613 * May temporarily disable and re-enable interrupts

5614 */

5615 kern_return_t

 waitq_wakeup64_thread(struct waitq *waitq,

5617 event64_t wake_event,

5618 thread_t thread,

5619 wait_result_t result)

5620 {

5621 kern_return_t ret;

5622 spl_t s, th_spl;

5623

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

5626 }

5627

         if (waitq_irq_safe(waitq)) {

5629 s = splsched();

5630 }

5631 waitq_lock(waitq);

5632

         ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);

5634 /* on success, returns 'thread' locked */

5635

5636 waitq_unlock(waitq);

5637

5638 if (ret == KERN_SUCCESS) {

                 ret = thread_go(thread, result, WQ_OPTION_NONE);

5640 assert(ret == KERN_SUCCESS);

5641 thread_unlock(thread);

5642 splx(th_spl);

5643 waitq_stats_count_wakeup(waitq);

5644 } else {

5645 ret = KERN_NOT_WAITING;

5646 waitq_stats_count_fail(waitq);

5647 }

5648

         if (waitq_irq_safe(waitq)) {

5650 splx(s);

5651 }

5652

5653 return ret;

5654 }

5655

5656 /**

5657 * wakeup a single thread from a waitq that's waiting for a given event

5658 * and return a reference to that thread

5659 * returns THREAD_NULL if no thread was waiting

5660 *

5661 * Conditions:

5662 * 'waitq' is not locked

5663 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets

5664 * may disable and re-enable interrupts

5665 *

5666 * Notes:

5667 * will _not_ block if waitq is global (or not a member of any set)

5668 */

5669 thread_t

 waitq_wakeup64_identify(struct waitq    *waitq,

5671 event64_t wake_event,

5672 wait_result_t result,

5673 int priority)

5674 {

5675 uint64_t reserved_preposts = 0;

5676 spl_t thread_spl = 0;

5677 thread_t thread;

5678 spl_t spl;

5679

         if (!waitq_valid(waitq)) {

                 panic("Invalid waitq: %p", waitq);

5682 }

5683

         if (!waitq_irq_safe(waitq)) {

5685 /* reserve preposts in addition to locking waitq */

                 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);

5687 } else {

5688 spl = splsched();

5689 waitq_lock(waitq);

5690 }

5691

         thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,

5693 &thread_spl, &reserved_preposts,

5694 priority, WAITQ_UNLOCK);

5695 /* waitq is unlocked, thread is locked */

5696

5697 if (thread != THREAD_NULL) {

5698 thread_reference(thread);

5699 thread_unlock(thread);

5700 splx(thread_spl);

5701 }

5702

         if (waitq_irq_safe(waitq)) {

5704 splx(spl);

5705 }

5706

5707 /* release any left-over prepost object (won't block/lock anything) */

5708 waitq_prepost_release_reserve(reserved_preposts);

5709

5710 /* returns +1 ref to running thread or THREAD_NULL */

5711 return thread;

5712 }