git.saurik.com Git - apple/libdispatch.git/blobdiff - src/semaphore.c
libdispatch-913.30.4.tar.gz
[apple/libdispatch.git] / src / semaphore.c
index 9e36d4db19130ae2a9fb64ac7a0e348afc93e0fd..3fe94c6e3efda16a3e7bc70b74cc3bd8ac35d308 100644 (file)
 /*
- * Copyright (c) 2008-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
  *
  * @APPLE_APACHE_LICENSE_HEADER_START@
- * 
+ *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- * 
+ *
  * @APPLE_APACHE_LICENSE_HEADER_END@
  */
 
 #include "internal.h"
 
-// semaphores are too fundamental to use the dispatch_assume*() macros
-#define DISPATCH_SEMAPHORE_VERIFY_KR(x)        do {    \
-               if (x) {        \
-                       DISPATCH_CRASH("flawed group/semaphore logic"); \
-               }       \
-       } while (0)
+DISPATCH_WEAK // rdar://problem/8503746
+long _dispatch_semaphore_signal_slow(dispatch_semaphore_t dsema);
+
+#pragma mark -
+#pragma mark dispatch_semaphore_class_t
 
-struct dispatch_semaphore_vtable_s {
-       DISPATCH_VTABLE_HEADER(dispatch_semaphore_s);
-};
+static void
+_dispatch_semaphore_class_init(long value, dispatch_semaphore_class_t dsemau) // common initializer; called in this file for both semaphores and groups
+{
+       struct dispatch_semaphore_header_s *dsema = dsemau._dsema_hdr; // access the object through its shared header member
 
-static void _dispatch_semaphore_dispose(dispatch_semaphore_t dsema);
-static size_t _dispatch_semaphore_debug(dispatch_semaphore_t dsema, char *buf, size_t bufsiz);
-static long _dispatch_group_wake(dispatch_semaphore_t dsema);
+       dsema->do_next = DISPATCH_OBJECT_LISTLESS;
+       dsema->do_targetq = _dispatch_get_root_queue(DISPATCH_QOS_DEFAULT, false);
+       dsema->dsema_value = value; // starting count chosen by the caller
+       _dispatch_sema4_init(&dsema->dsema_sema, _DSEMA4_POLICY_FIFO);
+}
 
-const struct dispatch_semaphore_vtable_s _dispatch_semaphore_vtable = {
-       .do_type = DISPATCH_SEMAPHORE_TYPE,
-       .do_kind = "semaphore",
-       .do_dispose = _dispatch_semaphore_dispose,
-       .do_debug = _dispatch_semaphore_debug,
-};
+#pragma mark -
+#pragma mark dispatch_semaphore_t
 
 dispatch_semaphore_t
-_dispatch_get_thread_semaphore(void)
+dispatch_semaphore_create(long value) // rejects negative values; otherwise returns a new semaphore whose current and original counts are both value
 {
        dispatch_semaphore_t dsema;
-       
-       dsema = fastpath(_dispatch_thread_getspecific(dispatch_sema4_key));
-       if (!dsema) {
-               while (!(dsema = dispatch_semaphore_create(0))) {
-                       sleep(1);
-               }
+
+       // If the internal value is negative, then its absolute value is
+       // equal to the number of waiting threads. Therefore it is bogus to
+       // initialize the semaphore with a negative value.
+       if (value < 0) {
+               return DISPATCH_BAD_INPUT;
        }
-       _dispatch_thread_setspecific(dispatch_sema4_key, NULL);
+
+       dsema = (dispatch_semaphore_t)_dispatch_object_alloc(
+                       DISPATCH_VTABLE(semaphore), sizeof(struct dispatch_semaphore_s));
+       _dispatch_semaphore_class_init(value, dsema);
+       dsema->dsema_orig = value; // kept for the "deallocated while in use" check in _dispatch_semaphore_dispose()
        return dsema;
 }
 
 void
-_dispatch_put_thread_semaphore(dispatch_semaphore_t dsema)
+_dispatch_semaphore_dispose(dispatch_object_t dou,
+               DISPATCH_UNUSED bool *allow_free) // crashes the client if the count is below its original value, then disposes the sema4
 {
-       dispatch_semaphore_t old_sema = _dispatch_thread_getspecific(dispatch_sema4_key);
-       _dispatch_thread_setspecific(dispatch_sema4_key, dsema);
-       if (old_sema) {
-               dispatch_release(old_sema);
+       dispatch_semaphore_t dsema = dou._dsema;
+
+       if (dsema->dsema_value < dsema->dsema_orig) {
+               DISPATCH_CLIENT_CRASH(dsema->dsema_orig - dsema->dsema_value, // first argument: how far the count is below the original
+                               "Semaphore object deallocated while in use");
        }
+
+       _dispatch_sema4_dispose(&dsema->dsema_sema, _DSEMA4_POLICY_FIFO);
 }
 
-dispatch_group_t
-dispatch_group_create(void)
+size_t
+_dispatch_semaphore_debug(dispatch_object_t dou, char *buf, size_t bufsiz) // formats a description of the semaphore into buf; returns the accumulated offset
 {
-       return (dispatch_group_t)dispatch_semaphore_create(LONG_MAX);
+       dispatch_semaphore_t dsema = dou._dsema;
+
+       size_t offset = 0; // running sum of the values returned by the formatting helpers
+       offset += dsnprintf(&buf[offset], bufsiz - offset, "%s[%p] = { ",
+                       dx_kind(dsema), dsema);
+       offset += _dispatch_object_debug_attr(dsema, &buf[offset], bufsiz - offset);
+#if USE_MACH_SEM
+       offset += dsnprintf(&buf[offset], bufsiz - offset, "port = 0x%u, ",
+                       dsema->dsema_sema); // NOTE(review): "0x" prefix is paired with %u (decimal) — confirm intended
+#endif
+       offset += dsnprintf(&buf[offset], bufsiz - offset,
+                       "value = %ld, orig = %ld }", dsema->dsema_value, dsema->dsema_orig);
+       return offset;
 }
 
-dispatch_semaphore_t
-dispatch_semaphore_create(long value)
+DISPATCH_NOINLINE
+long
+_dispatch_semaphore_signal_slow(dispatch_semaphore_t dsema) // slow path of dispatch_semaphore_signal(); always returns 1
 {
-       dispatch_semaphore_t dsema;
-       
-       // If the internal value is negative, then the absolute of the value is
-       // equal to the number of waiting threads. Therefore it is bogus to
-       // initialize the semaphore with a negative value.
-       if (value < 0) {
-               return NULL;
-       }
-       
-       dsema = calloc(1, sizeof(struct dispatch_semaphore_s));
-       
-       if (fastpath(dsema)) {
-               dsema->do_vtable = &_dispatch_semaphore_vtable;
-               dsema->do_next = DISPATCH_OBJECT_LISTLESS;
-               dsema->do_ref_cnt = 1;
-               dsema->do_xref_cnt = 1;
-               dsema->do_targetq = dispatch_get_global_queue(0, 0);
-               dsema->dsema_value = value;
-               dsema->dsema_orig = value;
-       }
-       
-       return dsema;
+       _dispatch_sema4_create(&dsema->dsema_sema, _DSEMA4_POLICY_FIFO);
+       _dispatch_sema4_signal(&dsema->dsema_sema, 1); // signal with a count of 1
+       return 1;
 }
 
-static void
-_dispatch_semaphore_create_port(semaphore_t *s4)
+long
+dispatch_semaphore_signal(dispatch_semaphore_t dsema) // increments the count; returns 0 on the fast path, else the slow path's result
 {
-       kern_return_t kr;
-       semaphore_t tmp;
-
-       if (*s4) {
-               return;
-       }
-       
-       // lazily allocate the semaphore port
-       
-       // Someday:
-       // 1) Switch to a doubly-linked FIFO in user-space.
-       // 2) User-space timers for the timeout.
-       // 3) Use the per-thread semaphore port.
-       
-       while (dispatch_assume_zero(kr = semaphore_create(mach_task_self(), &tmp, SYNC_POLICY_FIFO, 0))) {
-               DISPATCH_VERIFY_MIG(kr);
-               sleep(1);
+       long value = os_atomic_inc2o(dsema, dsema_value, release);
+       if (fastpath(value > 0)) { // positive result: return 0 without touching the sema4
+               return 0;
        }
-       
-       if (!dispatch_atomic_cmpxchg(s4, 0, tmp)) {
-               kr = semaphore_destroy(mach_task_self(), tmp);
-               DISPATCH_SEMAPHORE_VERIFY_KR(kr);
+       if (slowpath(value == LONG_MIN)) { // LONG_MIN result is reported as an unbalanced signal
+               DISPATCH_CLIENT_CRASH(value,
+                               "Unbalanced call to dispatch_semaphore_signal()");
        }
-
-       _dispatch_safe_fork = false;
+       return _dispatch_semaphore_signal_slow(dsema);
 }
 
 DISPATCH_NOINLINE
 static long
-_dispatch_semaphore_wait_slow(dispatch_semaphore_t dsema, dispatch_time_t timeout)
+_dispatch_semaphore_wait_slow(dispatch_semaphore_t dsema,
+               dispatch_time_t timeout) // slow path of dispatch_semaphore_wait(): returns 0 once a wait completes, or _DSEMA4_TIMEOUT() after undoing the decrement
 {
-       mach_timespec_t _timeout;
-       kern_return_t kr;
-       uint64_t nsec;
        long orig;
-       
-again:
-       // Mach semaphores appear to sometimes spuriously wake up.  Therefore,
-       // we keep a parallel count of the number of times a Mach semaphore is
-       // signaled.
-       while ((orig = dsema->dsema_sent_ksignals)) {
-               if (dispatch_atomic_cmpxchg(&dsema->dsema_sent_ksignals, orig, orig - 1)) {
-                       return 0;
-               }
-       }
-
-       _dispatch_semaphore_create_port(&dsema->dsema_port);
-
-       // From xnu/osfmk/kern/sync_sema.c:
-       // wait_semaphore->count = -1;  /* we don't keep an actual count */
-       //
-       // The code above does not match the documentation, and that fact is
-       // not surprising. The documented semantics are clumsy to use in any
-       // practical way. The above hack effectively tricks the rest of the
-       // Mach semaphore logic to behave like the libdispatch algorithm.
 
+       _dispatch_sema4_create(&dsema->dsema_sema, _DSEMA4_POLICY_FIFO);
        switch (timeout) {
        default:
-               do {
-                       // timeout() already calculates relative time left
-                       nsec = _dispatch_timeout(timeout);
-                       _timeout.tv_sec = (typeof(_timeout.tv_sec))(nsec / NSEC_PER_SEC);
-                       _timeout.tv_nsec = (typeof(_timeout.tv_nsec))(nsec % NSEC_PER_SEC);
-                       kr = slowpath(semaphore_timedwait(dsema->dsema_port, _timeout));
-               } while (kr == KERN_ABORTED);
-
-               if (kr != KERN_OPERATION_TIMED_OUT) {
-                       DISPATCH_SEMAPHORE_VERIFY_KR(kr);
+               if (!_dispatch_sema4_timedwait(&dsema->dsema_sema, timeout)) { // zero result: leave the switch and return 0
                        break;
                }
-               // Fall through and try to undo what the fast path did to dsema->dsema_value
+               // Fall through and try to undo what the fast path did to
+               // dsema->dsema_value
        case DISPATCH_TIME_NOW:
-               while ((orig = dsema->dsema_value) < 0) {
-                       if (dispatch_atomic_cmpxchg(&dsema->dsema_value, orig, orig + 1)) {
-                               return KERN_OPERATION_TIMED_OUT;
+               orig = dsema->dsema_value;
+               while (orig < 0) { // only a still-negative count can be given back
+                       if (os_atomic_cmpxchgvw2o(dsema, dsema_value, orig, orig + 1,
+                                       &orig, relaxed)) { // NOTE(review): presumably refreshes orig on failure, since the loop never re-reads dsema_value — confirm
+                               return _DSEMA4_TIMEOUT();
                        }
                }
                // Another thread called semaphore_signal().
                // Fall through and drain the wakeup.
        case DISPATCH_TIME_FOREVER:
-               do {
-                       kr = semaphore_wait(dsema->dsema_port);
-               } while (kr == KERN_ABORTED);
-               DISPATCH_SEMAPHORE_VERIFY_KR(kr);
+               _dispatch_sema4_wait(&dsema->dsema_sema);
                break;
        }
-
-       goto again;
-}
-
-DISPATCH_NOINLINE
-void
-dispatch_group_enter(dispatch_group_t dg)
-{
-       dispatch_semaphore_t dsema = (dispatch_semaphore_t)dg;
-#if defined(__OPTIMIZE__) && defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-       // This assumes:
-       // 1) Way too much about the optimizer of GCC.
-       // 2) There will never be more than LONG_MAX threads.
-       //    Therefore: no overflow detection
-       asm(
-#ifdef __LP64__ 
-               "lock decq      %0\n\t"
-#else
-               "lock decl      %0\n\t"
-#endif
-               "js     1f\n\t"
-               "xor    %%eax, %%eax\n\t"
-               "ret\n\t"
-               "1:"
-               : "+m" (dsema->dsema_value)
-               :
-               : "cc"
-       );
-       _dispatch_semaphore_wait_slow(dsema, DISPATCH_TIME_FOREVER);
-#else
-       dispatch_semaphore_wait(dsema, DISPATCH_TIME_FOREVER);
-#endif
+       return 0; // reached through the break statements above
 }
 
-DISPATCH_NOINLINE
 long
 dispatch_semaphore_wait(dispatch_semaphore_t dsema, dispatch_time_t timeout)
 {
-#if defined(__OPTIMIZE__) && defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-       // This assumes:
-       // 1) Way too much about the optimizer of GCC.
-       // 2) There will never be more than LONG_MAX threads.
-       //    Therefore: no overflow detection
-       asm(
-#ifdef __LP64__ 
-               "lock decq      %0\n\t"
-#else
-               "lock decl      %0\n\t"
-#endif
-               "js     1f\n\t"
-               "xor    %%eax, %%eax\n\t"
-               "ret\n\t"
-               "1:"
-               : "+m" (dsema->dsema_value)
-               :
-               : "cc"
-       );
-#else
-       if (dispatch_atomic_dec(&dsema->dsema_value) >= 0) {
+       long value = os_atomic_dec2o(dsema, dsema_value, acquire); // atomic decrement of the count, acquire ordering
+       if (fastpath(value >= 0)) { // non-negative result: return 0 without entering the slow path
                return 0;
        }
-#endif
        return _dispatch_semaphore_wait_slow(dsema, timeout);
 }
 
-DISPATCH_NOINLINE
-static long
-_dispatch_semaphore_signal_slow(dispatch_semaphore_t dsema)
+#pragma mark -
+#pragma mark dispatch_group_t
+
+DISPATCH_ALWAYS_INLINE
+static inline dispatch_group_t
+_dispatch_group_create_with_count(long count) // allocates a group and passes count to _dispatch_semaphore_class_init() as its starting value
 {
-       kern_return_t kr;
-       
-       _dispatch_semaphore_create_port(&dsema->dsema_port);
-
-       // Before dsema_sent_ksignals is incremented we can rely on the reference
-       // held by the waiter. However, once this value is incremented the waiter
-       // may return between the atomic increment and the semaphore_signal(),
-       // therefore an explicit reference must be held in order to safely access
-       // dsema after the atomic increment.
-       _dispatch_retain(dsema);
-       
-       dispatch_atomic_inc(&dsema->dsema_sent_ksignals);
-       
-       kr = semaphore_signal(dsema->dsema_port);
-       DISPATCH_SEMAPHORE_VERIFY_KR(kr);
-
-       _dispatch_release(dsema);
-       
-       return 1;
+       dispatch_group_t dg = (dispatch_group_t)_dispatch_object_alloc(
+                       DISPATCH_VTABLE(group), sizeof(struct dispatch_group_s));
+       _dispatch_semaphore_class_init(count, dg);
+       if (count) { // non-zero start: same radar as the _dispatch_retain() done by dispatch_group_enter() when the count leaves 0
+               os_atomic_store2o(dg, do_ref_cnt, 1, relaxed); // <rdar://problem/22318411>
+       }
+       return dg;
 }
 
-void
-dispatch_group_leave(dispatch_group_t dg)
+dispatch_group_t
+dispatch_group_create(void) // new group with a starting count of 0
 {
-       dispatch_semaphore_t dsema = (dispatch_semaphore_t)dg;
-
-       dispatch_semaphore_signal(dsema);
+       return _dispatch_group_create_with_count(0);
+}
 
-       if (dsema->dsema_value == dsema->dsema_orig) {
-               _dispatch_group_wake(dsema);
-       }
+dispatch_group_t
+_dispatch_group_create_and_enter(void) // new group with a starting count of 1
+{
+       return _dispatch_group_create_with_count(1);
 }
 
-DISPATCH_NOINLINE
-long
-dispatch_semaphore_signal(dispatch_semaphore_t dsema)
+void
+dispatch_group_enter(dispatch_group_t dg) // increments dg_value; crashes on overflow; retains dg when the value seen was 0
 {
-#if defined(__OPTIMIZE__) && defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-       // overflow detection
-       // this assumes way too much about the optimizer of GCC
-       asm(
-#ifdef __LP64__ 
-               "lock incq      %0\n\t"
-#else
-               "lock incl      %0\n\t"
-#endif
-               "jo     1f\n\t"
-               "jle    2f\n\t"
-               "xor    %%eax, %%eax\n\t"
-               "ret\n\t"
-               "1:\n\t"
-               "int    $4\n\t"
-               "2:"
-               : "+m" (dsema->dsema_value)
-               :
-               : "cc"
-       );
-#else
-       if (dispatch_atomic_inc(&dsema->dsema_value) > 0) {
-               return 0;
+       long value = os_atomic_inc_orig2o(dg, dg_value, acquire); // NOTE(review): presumably the pre-increment value ("_orig") — confirm
+       if (slowpath((unsigned long)value >= (unsigned long)LONG_MAX)) { // unsigned compare: trips for LONG_MAX and for any negative value
+               DISPATCH_CLIENT_CRASH(value,
+                               "Too many nested calls to dispatch_group_enter()");
+       }
+       if (value == 0) {
+               _dispatch_retain(dg); // <rdar://problem/22318411>
        }
-#endif
-       return _dispatch_semaphore_signal_slow(dsema);
 }
 
 DISPATCH_NOINLINE
-long
-_dispatch_group_wake(dispatch_semaphore_t dsema)
+static long
+_dispatch_group_wake(dispatch_group_t dg, bool needs_release) // signals registered waiters, asyncs queued notify continuations, drops references; always returns 0
 {
-       struct dispatch_sema_notify_s *tmp, *head = dispatch_atomic_xchg(&dsema->dsema_notify_head, NULL);
-       long rval = dispatch_atomic_xchg(&dsema->dsema_group_waiters, 0);
-       bool do_rel = head;
-       long kr;
-
-       // wake any "group" waiter or notify blocks
-       
+       dispatch_continuation_t next, head, tail = NULL;
+       long rval;
+
+       // cannot use os_mpsc_capture_snapshot() because we can have concurrent
+       // _dispatch_group_wake() calls
+       head = os_atomic_xchg2o(dg, dg_notify_head, NULL, relaxed); // take the notify list head and leave NULL behind
+       if (head) {
+               // snapshot before anything is notified/woken <rdar://problem/8554546>
+               tail = os_atomic_xchg2o(dg, dg_notify_tail, NULL, release);
+       }
+       rval = (long)os_atomic_xchg2o(dg, dg_waiters, 0, relaxed); // take the waiter count and reset it to 0
        if (rval) {
-               _dispatch_semaphore_create_port(&dsema->dsema_waiter_port);
-               do {
-                       kr = semaphore_signal(dsema->dsema_waiter_port);
-                       DISPATCH_SEMAPHORE_VERIFY_KR(kr);
-               } while (--rval);
+               // wake group waiters
+               _dispatch_sema4_create(&dg->dg_sema, _DSEMA4_POLICY_FIFO);
+               _dispatch_sema4_signal(&dg->dg_sema, rval); // rval is the waiter count captured above
        }
-       while (head) {
-               dispatch_async_f(head->dsn_queue, head->dsn_ctxt, head->dsn_func);
-               _dispatch_release(head->dsn_queue);
+       uint16_t refs = needs_release ? 1 : 0; // <rdar://problem/22318411>
+       if (head) {
+               // async group notify blocks
                do {
-                       tmp = head->dsn_next;
-               } while (!tmp && !dispatch_atomic_cmpxchg(&dsema->dsema_notify_tail, head, NULL));
-               free(head);
-               head = tmp;
-       }
-       if (do_rel) {
-               _dispatch_release(dsema);
+                       next = os_mpsc_pop_snapshot_head(head, tail, do_next);
+                       dispatch_queue_t dsn_queue = (dispatch_queue_t)head->dc_data; // target queue stored in dc_data by _dispatch_group_notify()
+                       _dispatch_continuation_async(dsn_queue, head);
+                       _dispatch_release(dsn_queue); // pairs with _dispatch_retain(dq) in _dispatch_group_notify()
+               } while ((head = next));
+               refs++; // pairs with _dispatch_retain(dg) in _dispatch_group_notify()
        }
+       if (refs) _dispatch_release_n(dg, refs); // drop all accumulated references in one call
        return 0;
 }
 
+void
+dispatch_group_leave(dispatch_group_t dg) // decrements dg_value; wakes the group when the result is 0, crashes when it is negative
+{
+       long value = os_atomic_dec2o(dg, dg_value, release);
+       if (slowpath(value == 0)) {
+               return (void)_dispatch_group_wake(dg, true); // true: the wake path also releases one reference on dg
+       }
+       if (slowpath(value < 0)) {
+               DISPATCH_CLIENT_CRASH(value,
+                               "Unbalanced call to dispatch_group_leave()");
+       }
+}
+
+void
+_dispatch_group_dispose(dispatch_object_t dou, DISPATCH_UNUSED bool *allow_free) // crashes the client if dg_value is non-zero, then disposes the group's sema4
+{
+       dispatch_group_t dg = dou._dg;
+
+       if (dg->dg_value) { // any non-zero count is reported as "in use"
+               DISPATCH_CLIENT_CRASH(dg->dg_value,
+                               "Group object deallocated while in use");
+       }
+
+       _dispatch_sema4_dispose(&dg->dg_sema, _DSEMA4_POLICY_FIFO);
+}
+
+size_t
+_dispatch_group_debug(dispatch_object_t dou, char *buf, size_t bufsiz) // formats a description of the group into buf; returns the accumulated offset
+{
+       dispatch_group_t dg = dou._dg;
+
+       size_t offset = 0; // running sum of the values returned by the formatting helpers
+       offset += dsnprintf(&buf[offset], bufsiz - offset, "%s[%p] = { ",
+                       dx_kind(dg), dg);
+       offset += _dispatch_object_debug_attr(dg, &buf[offset], bufsiz - offset);
+#if USE_MACH_SEM
+       offset += dsnprintf(&buf[offset], bufsiz - offset, "port = 0x%u, ",
+                       dg->dg_sema); // NOTE(review): "0x" prefix is paired with %u (decimal) — confirm intended
+#endif
+       offset += dsnprintf(&buf[offset], bufsiz - offset,
+                       "count = %ld, waiters = %d }", dg->dg_value, dg->dg_waiters);
+       return offset;
+}
+
 DISPATCH_NOINLINE
 static long
-_dispatch_group_wait_slow(dispatch_semaphore_t dsema, dispatch_time_t timeout)
+_dispatch_group_wait_slow(dispatch_group_t dg, dispatch_time_t timeout) // blocking path of dispatch_group_wait(): returns 0 when done, or _DSEMA4_TIMEOUT() after undoing the waiter registration
 {
-       mach_timespec_t _timeout;
-       kern_return_t kr;
-       uint64_t nsec;
-       long orig;
-       
-again:
-       // check before we cause another signal to be sent by incrementing dsema->dsema_group_waiters
-       if (dsema->dsema_value == dsema->dsema_orig) {
-               return _dispatch_group_wake(dsema);
+       long value;
+       int orig_waiters;
+
+       // check before we cause another signal to be sent by incrementing
+       // dg->dg_waiters
+       value = os_atomic_load2o(dg, dg_value, ordered); // 19296565
+       if (value == 0) {
+               return _dispatch_group_wake(dg, false); // count already 0: run the wake path and return its result (0)
        }
-       // Mach semaphores appear to sometimes spuriously wake up.  Therefore,
-       // we keep a parallel count of the number of times a Mach semaphore is
-       // signaled.
-       dispatch_atomic_inc(&dsema->dsema_group_waiters);
+
+       (void)os_atomic_inc2o(dg, dg_waiters, relaxed); // register this thread as a waiter
        // check the values again in case we need to wake any threads
-       if (dsema->dsema_value == dsema->dsema_orig) {
-               return _dispatch_group_wake(dsema);
+       value = os_atomic_load2o(dg, dg_value, ordered); // 19296565
+       if (value == 0) {
+               _dispatch_group_wake(dg, false);
+               // Fall through to consume the extra signal, forcing timeout to avoid
+               // useless setups as it won't block
+               timeout = DISPATCH_TIME_FOREVER;
        }
 
-       _dispatch_semaphore_create_port(&dsema->dsema_waiter_port);
-       
-       // From xnu/osfmk/kern/sync_sema.c:
-       // wait_semaphore->count = -1;  /* we don't keep an actual count */
-       //
-       // The code above does not match the documentation, and that fact is
-       // not surprising. The documented semantics are clumsy to use in any
-       // practical way. The above hack effectively tricks the rest of the
-       // Mach semaphore logic to behave like the libdispatch algorithm.
-       
+       _dispatch_sema4_create(&dg->dg_sema, _DSEMA4_POLICY_FIFO);
        switch (timeout) {
        default:
-               do {
-                       nsec = _dispatch_timeout(timeout);
-                       _timeout.tv_sec = (typeof(_timeout.tv_sec))(nsec / NSEC_PER_SEC);
-                       _timeout.tv_nsec = (typeof(_timeout.tv_nsec))(nsec % NSEC_PER_SEC);
-                       kr = slowpath(semaphore_timedwait(dsema->dsema_waiter_port, _timeout));
-               } while (kr == KERN_ABORTED);
-               if (kr != KERN_OPERATION_TIMED_OUT) {
-                       DISPATCH_SEMAPHORE_VERIFY_KR(kr);
+               if (!_dispatch_sema4_timedwait(&dg->dg_sema, timeout)) { // zero result: leave the switch and return 0
                        break;
                }
-               // Fall through and try to undo the earlier change to dsema->dsema_group_waiters
+               // Fall through and try to undo the earlier change to
+               // dg->dg_waiters
        case DISPATCH_TIME_NOW:
-               while ((orig = dsema->dsema_group_waiters)) {
-                       if (dispatch_atomic_cmpxchg(&dsema->dsema_group_waiters, orig, orig - 1)) {
-                               return KERN_OPERATION_TIMED_OUT;
+               orig_waiters = dg->dg_waiters;
+               while (orig_waiters) { // a zero count means _dispatch_group_wake() already took the registrations
+                       if (os_atomic_cmpxchgvw2o(dg, dg_waiters, orig_waiters,
+                                       orig_waiters - 1, &orig_waiters, relaxed)) { // NOTE(review): presumably refreshes orig_waiters on failure, since the loop never re-reads dg_waiters — confirm
+                               return _DSEMA4_TIMEOUT();
                        }
                }
-               // Another thread called semaphore_signal().
+               // Another thread is running _dispatch_group_wake()
                // Fall through and drain the wakeup.
        case DISPATCH_TIME_FOREVER:
-               do {
-                       kr = semaphore_wait(dsema->dsema_waiter_port);
-               } while (kr == KERN_ABORTED);
-               DISPATCH_SEMAPHORE_VERIFY_KR(kr);
+               _dispatch_sema4_wait(&dg->dg_sema);
                break;
        }
-
-       goto again;
+       return 0; // reached through the break statements above
 }
 
 long
 dispatch_group_wait(dispatch_group_t dg, dispatch_time_t timeout)
 {
-       dispatch_semaphore_t dsema = (dispatch_semaphore_t)dg;
-
-       if (dsema->dsema_value == dsema->dsema_orig) {
+       if (dg->dg_value == 0) { // count already 0: return 0 without waiting
                return 0;
        }
        if (timeout == 0) {
-               return KERN_OPERATION_TIMED_OUT;
+               return _DSEMA4_TIMEOUT(); // zero timeout with a non-zero count: report timeout without entering the slow path
        }
-       return _dispatch_group_wait_slow(dsema, timeout);
-}
-
-#ifdef __BLOCKS__
-void
-dispatch_group_notify(dispatch_group_t dg, dispatch_queue_t dq, dispatch_block_t db)
-{
-       dispatch_group_notify_f(dg, dq, _dispatch_Block_copy(db), _dispatch_call_block_and_release);
+       return _dispatch_group_wait_slow(dg, timeout);
 }
-#endif
 
-void
-dispatch_group_notify_f(dispatch_group_t dg, dispatch_queue_t dq, void *ctxt, void (*func)(void *))
+DISPATCH_ALWAYS_INLINE
+static inline void
+_dispatch_group_notify(dispatch_group_t dg, dispatch_queue_t dq,
+               dispatch_continuation_t dsn) // pushes dsn onto the group's notify list; wakes the group right away if its count is already 0
 {
-       dispatch_semaphore_t dsema = (dispatch_semaphore_t)dg;
-       struct dispatch_sema_notify_s *dsn, *prev;
-
-       // FIXME -- this should be updated to use the continuation cache
-       while (!(dsn = malloc(sizeof(*dsn)))) {
-               sleep(1);
-       }
-
-       dsn->dsn_next = NULL;
-       dsn->dsn_queue = dq;
-       dsn->dsn_ctxt = ctxt;
-       dsn->dsn_func = func;
+       dsn->dc_data = dq; // target queue; read back from dc_data in _dispatch_group_wake()
+       dsn->do_next = NULL;
        _dispatch_retain(dq);
-
-       prev = dispatch_atomic_xchg(&dsema->dsema_notify_tail, dsn);
-       if (fastpath(prev)) {
-               prev->dsn_next = dsn;
-       } else {
+       if (os_mpsc_push_update_tail(dg, dg_notify, dsn, do_next)) { // NOTE(review): true presumably means the list was empty before this push — confirm
                _dispatch_retain(dg);
-               dsema->dsema_notify_head = dsn;
-               if (dsema->dsema_value == dsema->dsema_orig) {
-                       _dispatch_group_wake(dsema);
+               os_atomic_store2o(dg, dg_notify_head, dsn, ordered);
+               // seq_cst with atomic store to notify_head <rdar://problem/11750916>
+               if (os_atomic_load2o(dg, dg_value, ordered) == 0) {
+                       _dispatch_group_wake(dg, false);
                }
        }
 }
 
+DISPATCH_NOINLINE
 void
-_dispatch_semaphore_dispose(dispatch_semaphore_t dsema)
+dispatch_group_notify_f(dispatch_group_t dg, dispatch_queue_t dq, void *ctxt,
+               dispatch_function_t func) // function-pointer variant: wraps ctxt/func in a continuation and hands it to _dispatch_group_notify()
 {
-       kern_return_t kr;
-       
-       if (dsema->dsema_value < dsema->dsema_orig) {
-               DISPATCH_CLIENT_CRASH("Semaphore/group object deallocated while in use");
-       }
-       
-       if (dsema->dsema_port) {
-               kr = semaphore_destroy(mach_task_self(), dsema->dsema_port);
-               DISPATCH_SEMAPHORE_VERIFY_KR(kr);
-       }
-       if (dsema->dsema_waiter_port) {
-               kr = semaphore_destroy(mach_task_self(), dsema->dsema_waiter_port);
-               DISPATCH_SEMAPHORE_VERIFY_KR(kr);
-       }
-       
-       _dispatch_dispose(dsema);
-}
-
-size_t
-_dispatch_semaphore_debug(dispatch_semaphore_t dsema, char *buf, size_t bufsiz)
-{
-       size_t offset = 0;
-       offset += snprintf(&buf[offset], bufsiz - offset, "%s[%p] = { ", dx_kind(dsema), dsema);
-       offset += dispatch_object_debug_attr(dsema, &buf[offset], bufsiz - offset);
-       offset += snprintf(&buf[offset], bufsiz - offset, "port = 0x%u, value = %ld, orig = %ld }",
-                                          dsema->dsema_port, dsema->dsema_value, dsema->dsema_orig);
-       return offset;
+       dispatch_continuation_t dsn = _dispatch_continuation_alloc();
+       _dispatch_continuation_init_f(dsn, dq, ctxt, func, 0, 0,
+                       DISPATCH_OBJ_CONSUME_BIT);
+       _dispatch_group_notify(dg, dq, dsn); // dq is passed separately so it can be stored in dsn->dc_data
 }
 
 #ifdef __BLOCKS__
 void
-dispatch_group_async(dispatch_group_t dg, dispatch_queue_t dq, dispatch_block_t db)
+dispatch_group_notify(dispatch_group_t dg, dispatch_queue_t dq,
+               dispatch_block_t db) // block variant: wraps db in a continuation and hands it to _dispatch_group_notify()
 {
-       dispatch_group_async_f(dg, dq, _dispatch_Block_copy(db), _dispatch_call_block_and_release);
+       dispatch_continuation_t dsn = _dispatch_continuation_alloc();
+       _dispatch_continuation_init(dsn, dq, db, 0, 0, DISPATCH_OBJ_CONSUME_BIT);
+       _dispatch_group_notify(dg, dq, dsn); // same queuing path as dispatch_group_notify_f()
 }
 #endif
-
-DISPATCH_NOINLINE
-void
-dispatch_group_async_f(dispatch_group_t dg, dispatch_queue_t dq, void *ctxt, void (*func)(void *))
-{
-       dispatch_continuation_t dc;
-
-       _dispatch_retain(dg);
-       dispatch_group_enter(dg);
-
-       dc = _dispatch_continuation_alloc_cacheonly() ?: _dispatch_continuation_alloc_from_heap();
-
-       dc->do_vtable = (void *)(DISPATCH_OBJ_ASYNC_BIT|DISPATCH_OBJ_GROUP_BIT);
-       dc->dc_func = func;
-       dc->dc_ctxt = ctxt;
-       dc->dc_group = dg;
-
-       _dispatch_queue_push(dq, dc);
-}