-_os_once_gate_set_value_and_broadcast(os_once_gate_t og, os_lock_owner_t self,
-		os_once_t value)
-{
-	// The next barrier must be long and strong.
-	//
-	// The scenario: SMP systems with weakly ordered memory models
-	// and aggressive out-of-order instruction execution.
-	//
-	// The problem:
-	//
-	// The os_once*() wrapper macro causes the callee's
-	// instruction stream to look like this (pseudo-RISC):
-	//
-	//      load r5, pred-addr
-	//      cmpi r5, -1
-	//      beq  1f
-	//      call os_once*()
-	//      1f:
-	//      load r6, data-addr
-	//
-	// May be re-ordered like so:
-	//
-	//      load r6, data-addr
-	//      load r5, pred-addr
-	//      cmpi r5, -1
-	//      beq  1f
-	//      call os_once*()
-	//      1f:
-	//
-	// Normally, a barrier on the read side is used to work around
-	// the weakly ordered memory model. But barriers are expensive
-	// and we only need to synchronize once! After func(ctxt)
-	// completes, the predicate will be marked as "done" and the
-	// branch predictor will correctly skip the call to
-	// os_once*().
-	//
-	// A far faster alternative solution: Defeat the speculative
-	// read-ahead of peer CPUs.
-	//
-	// Modern architectures will throw away speculative results
-	// once a branch mis-prediction occurs. Therefore, if we can
-	// ensure that the predicate is not marked as being complete
-	// until long after the last store by func(ctxt), then we have
-	// defeated the read-ahead of peer CPUs.
-	//
-	// In other words, the last "store" by func(ctxt) must complete
-	// and then N cycles must elapse before ~0l is stored to *val.
-	// The value of N is whatever is sufficient to defeat the
-	// read-ahead mechanism of peer CPUs.
-	//
-	// On some CPUs, the most fully synchronizing instruction might
-	// need to be issued.
-	os_atomic_maximally_synchronizing_barrier();
-
-	// above assumed to contain release barrier
-	os_ulock_value_t current =
-			(os_ulock_value_t)os_atomic_xchg(&og->ogo_once, value, relaxed);
-	if (likely(current == self)) return;
-	_os_once_gate_broadcast_slow(&og->ogo_lock, current, self);
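
The comment above contrasts this write-side trick with the conventional approach of paying for an acquire barrier on every reader. For reference, a minimal sketch of that conventional shape in portable C11 (the names my_once_t and my_once are illustrative only, not libplatform API):

#include <stdatomic.h>

typedef struct { atomic_long pred; } my_once_t;	// 0 = pending, -1 = done

static void
my_once(my_once_t *o, void *ctxt, void (*func)(void *))
{
	// The acquire load is the "barrier on the read side": it orders the
	// subsequent loads of the initialized data, but it is paid on every
	// call, forever.
	if (atomic_load_explicit(&o->pred, memory_order_acquire) == -1L) {
		return;
	}
	// Slow path elided: serialize concurrent callers, run func(ctxt)
	// exactly once, then publish with a release store of -1 to o->pred.
	(void)ctxt; (void)func;
}

The replacement below drops the maximally synchronizing barrier; judging by the helper names, when OS_ONCE_USE_QUIESCENT_COUNTER is set the gate is first marked with a quiescent generation and only later promoted to OS_ONCE_DONE (see _os_once_mark_done_if_quiesced further down).
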
+_os_once_callout(os_once_gate_t og, void *ctxt, os_function_t func,
+		os_lock_owner_t self)
+{
+	uintptr_t v;
+
+	func(ctxt);
+
+#if OS_ONCE_USE_QUIESCENT_COUNTER
+	// store the current quiescent generation rather than OS_ONCE_DONE;
+	// the gate is promoted to OS_ONCE_DONE later, once the system has
+	// quiesced (see _os_once_mark_done_if_quiesced below)
+	v = _os_once_mark_quiescing(og);
+#else
+	v = _os_once_mark_done(og);
+#endif
+	// v is the previous gate value: if it is still exactly our owner
+	// token, no waiters registered; otherwise they must be woken
+	if (likely((os_ulock_value_t)v == self)) return;
+	_os_once_gate_broadcast(og, (os_ulock_value_t)v, self);
+}
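
_os_once_mark_done() and _os_once_mark_quiescing() are defined elsewhere in the file and are not part of this hunk. Judging from how _os_once_callout() consumes the return value (the previous gate word, compared against self to detect waiters), the non-quiescent variant plausibly reduces to a release exchange; a sketch under that assumption, not the verbatim implementation:

static inline uintptr_t
_os_once_mark_done(os_once_gate_t og)
{
	// release: every store made by func(ctxt) must be visible before
	// another thread can observe the gate as OS_ONCE_DONE
	return os_atomic_xchg(&og->ogo_once, OS_ONCE_DONE, release);
}
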
+
+OS_NOINLINE
+static void
+_os_once_gate_wait(os_once_gate_t og, void *ctxt, os_function_t func,
+		os_lock_owner_t self)
+{
+	uintptr_t old, new;
+
+	for (;;) {
+		os_atomic_rmw_loop(&og->ogo_once, old, new, relaxed, {
+			if (old == OS_ONCE_DONE) {
+				// initialization already completed, nothing to wait for
+				os_atomic_rmw_loop_give_up(return);
+#if OS_ONCE_USE_QUIESCENT_COUNTER
+			} else if (OS_ONCE_IS_GEN(old)) {
+				// the initializer has run but the gate still holds a
+				// quiescent generation; try to promote it to OS_ONCE_DONE
+				os_atomic_rmw_loop_give_up({
+					os_atomic_thread_fence(acquire);
+					return _os_once_mark_done_if_quiesced(og, old);
+				});
+#endif
+			} else if (old == OS_ONCE_INIT) {
+				// __os_once_reset was used, try to become the new initializer
+				new = (uintptr_t)self;
+			} else {
+				// register as a waiter by clearing the NOWAITERS bit;
+				// if it is already clear, there is nothing to update
+				new = old & ~(uintptr_t)OS_ULOCK_NOWAITERS_BIT;
+				if (new == old) os_atomic_rmw_loop_give_up(break);
+			}
+		});
+		if (old == OS_ONCE_INIT) {
+			// see comment in _os_once, pairs with the release barrier
+			// in __os_once_reset()
+			os_atomic_thread_fence(acquire);
+			return _os_once_callout(og, ctxt, func, self);
+		}
+		if (unlikely(OS_ULOCK_IS_OWNER((os_lock_owner_t)old, self, 0))) {
+			return _os_once_gate_recursive_abort(self);
+		}
+		// block in the kernel until the initializer broadcasts the gate
+		int ret = __ulock_wait(UL_UNFAIR_LOCK | ULF_NO_ERRNO,
+				&og->ogo_lock, (os_ulock_value_t)new, 0);
+		if (unlikely(ret < 0)) {
+			switch (-ret) {
+			case EINTR:
+			case EFAULT:
+				continue;
+			case EOWNERDEAD:
+				_os_once_gate_corruption_abort((os_lock_owner_t)old);
+				break;
+			default:
+				__LIBPLATFORM_INTERNAL_CRASH__(-ret, "ulock_wait failure");
+			}
+		}
+	}
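
The wake-up half, _os_once_gate_broadcast(), is outside this hunk. Assuming it mirrors the __ulock_wait() call above with the same UL_UNFAIR_LOCK | ULF_NO_ERRNO conventions, its core would be a wake-all on the same address; a hypothetical sketch (the name _example_gate_wake_all is not from libplatform):

static void
_example_gate_wake_all(os_once_gate_t og)
{
	for (;;) {
		// ULF_WAKE_ALL: every thread parked in __ulock_wait() on
		// &og->ogo_lock must resume now that the gate is resolved
		int ret = __ulock_wake(UL_UNFAIR_LOCK | ULF_NO_ERRNO | ULF_WAKE_ALL,
				&og->ogo_lock, 0);
		if (ret == -EINTR) continue;	// interrupted, retry the wake
		break;	// success, or -ENOENT when no waiters remain
	}
}
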