]> git.saurik.com Git - apple/libpthread.git/blobdiff - src/pthread_rwlock.c
libpthread-454.100.8.tar.gz
[apple/libpthread.git] / src / pthread_rwlock.c
index c7b5373a87893dfc42a439baca50194f0c9a2687..55834c925f9c5296a07d8648fd78008c3ac528c6 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2000-2003, 2007, 2008 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
@@ -17,7 +17,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_LICENSE_HEADER_END@
  */
 /*-
  * $FreeBSD: src/lib/libc_r/uthread/uthread_rwlock.c,v 1.6 2001/04/10 04:19:20 deischen Exp $
  */
 
-/* 
- * POSIX Pthread Library 
+/*
+ * POSIX Pthread Library
  * -- Read Write Lock support
  * 4/24/02: A. Ramesh
  *        Ported from FreeBSD
  */
 
+#include "resolver.h"
 #include "internal.h"
-#include <stdio.h>      /* For printf(). */
-
-extern int __unix_conforming;
 
 #ifdef PLOCKSTAT
 #include "plockstat.h"
@@ -76,40 +74,285 @@ extern int __unix_conforming;
 #define BLOCK_FAIL_PLOCKSTAT    0
 #define BLOCK_SUCCESS_PLOCKSTAT 1
 
-/* maximum number of times a read lock may be obtained */
-#define        MAX_READ_LOCKS          (INT_MAX - 1) 
+#define PTHREAD_RWLOCK_INIT_UNUSED 1
+
+// maximum number of times a read lock may be obtained
+#define        MAX_READ_LOCKS          (INT_MAX - 1)
+
+
+#if defined(__LP64__)
+#define RWLOCK_USE_INT128 1
+#endif
+
+typedef union rwlock_seq {
+       uint32_t seq[4];
+       struct { uint32_t lcntval; uint32_t rw_seq; uint32_t ucntval; };
+       struct { uint32_t lgen; uint32_t rw_wc; uint32_t ugen; };
+#if RWLOCK_USE_INT128
+       unsigned __int128 seq_LSU;
+       unsigned __int128 _Atomic atomic_seq_LSU;
+#endif
+       struct {
+               uint64_t seq_LS;
+               uint32_t seq_U;
+               uint32_t _pad;
+       };
+       struct {
+               uint64_t _Atomic atomic_seq_LS;
+               uint32_t _Atomic atomic_seq_U;
+               uint32_t _Atomic _atomic_pad;
+       };
+} rwlock_seq;
+
+_Static_assert(sizeof(rwlock_seq) == 4 * sizeof(uint32_t),
+               "Incorrect rwlock_seq size");
+
+typedef enum rwlock_seqfields {
+       RWLOCK_SEQ_NONE = 0,
+       RWLOCK_SEQ_LS = 1,
+       RWLOCK_SEQ_U = 2,
+       RWLOCK_SEQ_LSU = RWLOCK_SEQ_LS | RWLOCK_SEQ_U,
+} rwlock_seqfields;
+
+#if PTHREAD_DEBUG_LOG
+#define RWLOCK_DEBUG_SEQ(op, rwlock, oldseq, newseq, updateval, f) \
+               if (_pthread_debuglog >= 0) { \
+               _simple_dprintf(_pthread_debuglog, "rw_" #op " %p tck %7llu thr %llx " \
+               "L %x -> %x S %x -> %x U %x -> %x updt %x\n", rwlock, \
+               mach_absolute_time() - _pthread_debugstart, _pthread_threadid_self_np_direct(), \
+               (f) & RWLOCK_SEQ_LS ? (oldseq).lcntval : 0, \
+               (f) & RWLOCK_SEQ_LS ? (newseq).lcntval : 0, \
+               (f) & RWLOCK_SEQ_LS ? (oldseq).rw_seq  : 0, \
+               (f) & RWLOCK_SEQ_LS ? (newseq).rw_seq  : 0, \
+               (f) & RWLOCK_SEQ_U  ? (oldseq).ucntval : 0, \
+               (f) & RWLOCK_SEQ_U  ? (newseq).ucntval : 0, updateval); }
+#else
+#define RWLOCK_DEBUG_SEQ(m, rwlock, oldseq, newseq, updateval, f)
+#endif
+
+#if !__LITTLE_ENDIAN__
+#error RWLOCK_GETSEQ_ADDR assumes little endian layout of sequence words
+#endif
+
+OS_ALWAYS_INLINE
+static inline void
+RWLOCK_GETSEQ_ADDR(pthread_rwlock_t *rwlock, rwlock_seq **seqaddr)
+{
+       // 128-bit aligned address inside rw_seq & rw_mis arrays
+       *seqaddr = (void*)(((uintptr_t)rwlock->rw_seq + 0xful) & ~0xful);
+}
+
+OS_ALWAYS_INLINE
+static inline void
+RWLOCK_GETTID_ADDR(pthread_rwlock_t *rwlock, uint64_t **tidaddr)
+{
+       // 64-bit aligned address inside rw_tid array (&rw_tid[0] for aligned lock)
+       *tidaddr = (void*)(((uintptr_t)rwlock->rw_tid + 0x7ul) & ~0x7ul);
+}
+
+OS_ALWAYS_INLINE
+static inline void
+rwlock_seq_load(rwlock_seq *seqaddr, rwlock_seq *oldseqval,
+               const rwlock_seqfields seqfields)
+{
+       switch (seqfields) {
+       case RWLOCK_SEQ_LSU:
+#if RWLOCK_USE_INT128
+               oldseqval->seq_LSU = seqaddr->seq_LSU;
+#else
+               oldseqval->seq_LS = seqaddr->seq_LS;
+               oldseqval->seq_U = seqaddr->seq_U;
+#endif
+               break;
+       case RWLOCK_SEQ_LS:
+               oldseqval->seq_LS = seqaddr->seq_LS;
+               break;
+#if DEBUG // unused
+       case RWLOCK_SEQ_U:
+               oldseqval->seq_U = seqaddr->seq_U;
+               break;
+#endif // unused
+       default:
+               __builtin_trap();
+       }
+}
+
+OS_ALWAYS_INLINE
+static inline void
+rwlock_seq_atomic_load_relaxed(rwlock_seq *seqaddr, rwlock_seq *oldseqval,
+               const rwlock_seqfields seqfields)
+{
+       switch (seqfields) {
+       case RWLOCK_SEQ_LSU:
+#if RWLOCK_USE_INT128
+#if defined(__arm64__) && defined(__ARM_ARCH_8_2__)
+               // Workaround clang armv81 codegen bug for 128bit os_atomic_load
+               // rdar://problem/31213932
+               oldseqval->seq_LSU = seqaddr->seq_LSU;
+               while (!os_atomic_cmpxchgv(&seqaddr->atomic_seq_LSU,
+                               oldseqval->seq_LSU, oldseqval->seq_LSU, &oldseqval->seq_LSU,
+                               relaxed));
+#else
+               oldseqval->seq_LSU = os_atomic_load_wide(&seqaddr->atomic_seq_LSU, relaxed);
+#endif
+#else
+               oldseqval->seq_LS = os_atomic_load_wide(&seqaddr->atomic_seq_LS, relaxed);
+               oldseqval->seq_U = os_atomic_load(&seqaddr->atomic_seq_U, relaxed);
+#endif
+               break;
+       case RWLOCK_SEQ_LS:
+               oldseqval->seq_LS = os_atomic_load_wide(&seqaddr->atomic_seq_LS, relaxed);
+               break;
+#if DEBUG // unused
+       case RWLOCK_SEQ_U:
+               oldseqval->seq_U = os_atomic_load(&seqaddr->atomic_seq_U, relaxed);
+               break;
+#endif // unused
+       default:
+               __builtin_trap();
+       }
+}
+
+#define rwlock_seq_atomic_load(seqaddr, oldseqval, seqfields, m) \
+               rwlock_seq_atomic_load_##m(seqaddr, oldseqval, seqfields)
 
-#include <platform/string.h>
-#include <platform/compat.h>
+OS_ALWAYS_INLINE
+static inline rwlock_seqfields
+rwlock_seq_atomic_cmpxchgv_relaxed(rwlock_seq *seqaddr, rwlock_seq *oldseqval,
+               rwlock_seq *newseqval, const rwlock_seqfields seqfields)
+{
+       bool r;
+       rwlock_seqfields updated_seqfields = RWLOCK_SEQ_NONE;
+       switch (seqfields) {
+#if DEBUG // unused
+       case RWLOCK_SEQ_LSU:
+#if RWLOCK_USE_INT128
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LSU, oldseqval->seq_LSU,
+                               newseqval->seq_LSU, &oldseqval->seq_LSU, relaxed);
+               if (r) updated_seqfields = RWLOCK_SEQ_LSU;
+#else
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LS, oldseqval->seq_LS,
+                               newseqval->seq_LS, &oldseqval->seq_LS, relaxed);
+               if (r) {
+                       r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_U, oldseqval->seq_U,
+                                       newseqval->seq_U, &oldseqval->seq_U, relaxed);
+                       if (!r) oldseqval->seq_LS = newseqval->seq_LS;
+                       updated_seqfields = r ? RWLOCK_SEQ_LSU : RWLOCK_SEQ_LS;
+               } else {
+                       oldseqval->seq_U = os_atomic_load(&seqaddr->atomic_seq_U, relaxed);
+               }
+#endif
+               break;
+       case RWLOCK_SEQ_U:
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_U, oldseqval->seq_U,
+                               newseqval->seq_U, &oldseqval->seq_U, relaxed);
+               if (r) updated_seqfields = RWLOCK_SEQ_U;
+               break;
+#endif // unused
+       case RWLOCK_SEQ_LS:
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LS, oldseqval->seq_LS,
+                               newseqval->seq_LS, &oldseqval->seq_LS, relaxed);
+               if (r) updated_seqfields = RWLOCK_SEQ_LS;
+               break;
+       default:
+               __builtin_trap();
+       }
+       return updated_seqfields;
+}
 
-__private_extern__ int __pthread_rwlock_init(_pthread_rwlock *rwlock, const pthread_rwlockattr_t *attr);
-__private_extern__ void _pthread_rwlock_updateval(_pthread_rwlock *rwlock, uint32_t updateval);
+OS_ALWAYS_INLINE
+static inline rwlock_seqfields
+rwlock_seq_atomic_cmpxchgv_acquire(rwlock_seq *seqaddr, rwlock_seq *oldseqval,
+               rwlock_seq *newseqval, const rwlock_seqfields seqfields)
+{
+       bool r;
+       rwlock_seqfields updated_seqfields = RWLOCK_SEQ_NONE;
+       switch (seqfields) {
+#if DEBUG // unused
+       case RWLOCK_SEQ_LSU:
+#if RWLOCK_USE_INT128
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LSU, oldseqval->seq_LSU,
+                               newseqval->seq_LSU, &oldseqval->seq_LSU, acquire);
+               if (r) updated_seqfields = RWLOCK_SEQ_LSU;
+#else
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LS, oldseqval->seq_LS,
+                               newseqval->seq_LS, &oldseqval->seq_LS, acquire);
+               if (r) {
+                       r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_U, oldseqval->seq_U,
+                                       newseqval->seq_U, &oldseqval->seq_U, relaxed);
+                       if (!r) oldseqval->seq_LS = newseqval->seq_LS;
+                       updated_seqfields = r ? RWLOCK_SEQ_LSU : RWLOCK_SEQ_LS;
+               } else {
+                       oldseqval->seq_U = os_atomic_load(&seqaddr->atomic_seq_U, relaxed);
+               }
+#endif
+               break;
+       case RWLOCK_SEQ_U:
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_U, oldseqval->seq_U,
+                               newseqval->seq_U, &oldseqval->seq_U, acquire);
+               if (r) updated_seqfields = RWLOCK_SEQ_U;
+               break;
+#endif // unused
+       case RWLOCK_SEQ_LS:
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LS, oldseqval->seq_LS,
+                               newseqval->seq_LS, &oldseqval->seq_LS, acquire);
+               if (r) updated_seqfields = RWLOCK_SEQ_LS;
+               break;
+       default:
+               __builtin_trap();
+       }
+       return updated_seqfields;
+}
 
-static void
-RWLOCK_GETSEQ_ADDR(_pthread_rwlock *rwlock,
-                  volatile uint32_t **lcntaddr,
-                  volatile uint32_t **ucntaddr,
-                  volatile uint32_t **seqaddr)
+OS_ALWAYS_INLINE
+static inline rwlock_seqfields
+rwlock_seq_atomic_cmpxchgv_release(rwlock_seq *seqaddr, rwlock_seq *oldseqval,
+               rwlock_seq *newseqval, const rwlock_seqfields seqfields)
 {
-       if (rwlock->pshared == PTHREAD_PROCESS_SHARED) {
-               if (rwlock->misalign) {
-                       *lcntaddr = &rwlock->rw_seq[1];
-                       *seqaddr = &rwlock->rw_seq[2];
-                       *ucntaddr = &rwlock->rw_seq[3];
+       bool r;
+       rwlock_seqfields updated_seqfields = RWLOCK_SEQ_NONE;
+       switch (seqfields) {
+       case RWLOCK_SEQ_LSU:
+#if RWLOCK_USE_INT128
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LSU, oldseqval->seq_LSU,
+                               newseqval->seq_LSU, &oldseqval->seq_LSU, release);
+               if (r) updated_seqfields = RWLOCK_SEQ_LSU;
+#else
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_U, oldseqval->seq_U,
+                               newseqval->seq_U, &oldseqval->seq_U, release);
+               if (r) {
+                       r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LS, oldseqval->seq_LS,
+                                       newseqval->seq_LS, &oldseqval->seq_LS, relaxed);
+                       if (!r) oldseqval->seq_U = newseqval->seq_U;
+                       updated_seqfields = r ? RWLOCK_SEQ_LSU : RWLOCK_SEQ_U;
                } else {
-                       *lcntaddr = &rwlock->rw_seq[0];
-                       *seqaddr = &rwlock->rw_seq[1];
-                       *ucntaddr = &rwlock->rw_seq[2];
+                       oldseqval->seq_LS = os_atomic_load_wide(&seqaddr->atomic_seq_LS,
+                                       relaxed);
                }
-       } else {
-               *lcntaddr = rwlock->rw_lcntaddr;
-               *seqaddr = rwlock->rw_seqaddr;
-               *ucntaddr = rwlock->rw_ucntaddr;
+#endif
+               break;
+       case RWLOCK_SEQ_LS:
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_LS, oldseqval->seq_LS,
+                               newseqval->seq_LS, &oldseqval->seq_LS, release);
+               if (r) updated_seqfields = RWLOCK_SEQ_LS;
+               break;
+#if DEBUG // unused
+       case RWLOCK_SEQ_U:
+               r = os_atomic_cmpxchgv(&seqaddr->atomic_seq_U, oldseqval->seq_U,
+                               newseqval->seq_U, &oldseqval->seq_U, release);
+               if (r) updated_seqfields = RWLOCK_SEQ_U;
+               break;
+#endif // unused
+       default:
+               __builtin_trap();
        }
+       return updated_seqfields;
 }
 
+#define rwlock_seq_atomic_cmpxchgv(seqaddr, oldseqval, newseqval, seqfields, m)\
+               rwlock_seq_atomic_cmpxchgv_##m(seqaddr, oldseqval, newseqval, seqfields)
+
 #ifndef BUILDING_VARIANT /* [ */
-static uint32_t modbits(uint32_t lgenval, uint32_t updateval, uint32_t savebits);
 
 int
 pthread_rwlockattr_init(pthread_rwlockattr_t *attr)
@@ -119,7 +362,7 @@ pthread_rwlockattr_init(pthread_rwlockattr_t *attr)
        return 0;
 }
 
-int    
+int
 pthread_rwlockattr_destroy(pthread_rwlockattr_t *attr)
 {
        attr->sig = _PTHREAD_NO_SIG;
@@ -143,11 +386,8 @@ pthread_rwlockattr_setpshared(pthread_rwlockattr_t * attr, int pshared)
 {
        int res = EINVAL;
        if (attr->sig == _PTHREAD_RWLOCK_ATTR_SIG) {
-#if __DARWIN_UNIX03
-               if (( pshared == PTHREAD_PROCESS_PRIVATE) || (pshared == PTHREAD_PROCESS_SHARED))
-#else /* __DARWIN_UNIX03 */
-               if ( pshared == PTHREAD_PROCESS_PRIVATE)
-#endif /* __DARWIN_UNIX03 */
+               if (( pshared == PTHREAD_PROCESS_PRIVATE) ||
+                               (pshared == PTHREAD_PROCESS_SHARED))
                {
                        attr->pshared = pshared ;
                        res = 0;
@@ -156,16 +396,34 @@ pthread_rwlockattr_setpshared(pthread_rwlockattr_t * attr, int pshared)
        return res;
 }
 
-__private_extern__ int
-__pthread_rwlock_init(_pthread_rwlock *rwlock, const pthread_rwlockattr_t *attr)
+#endif /* !BUILDING_VARIANT ] */
+
+OS_ALWAYS_INLINE
+static inline int
+_pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
 {
-       // Force RWLOCK_GETSEQ_ADDR to calculate addresses by setting pshared.
-       rwlock->pshared = PTHREAD_PROCESS_SHARED;
-       rwlock->misalign = (((uintptr_t)&rwlock->rw_seq[0]) & 0x7) != 0;
-       RWLOCK_GETSEQ_ADDR(rwlock, &rwlock->rw_lcntaddr, &rwlock->rw_ucntaddr, &rwlock->rw_seqaddr);
-       *rwlock->rw_lcntaddr = PTHRW_RWLOCK_INIT;
-       *rwlock->rw_seqaddr = PTHRW_RWS_INIT;
-       *rwlock->rw_ucntaddr = 0;
+       uint64_t *tidaddr;
+       RWLOCK_GETTID_ADDR(rwlock, &tidaddr);
+
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
+
+#if PTHREAD_RWLOCK_INIT_UNUSED
+       if ((uint32_t*)tidaddr != rwlock->rw_tid) {
+               rwlock->misalign = 1;
+               __builtin_memset(rwlock->rw_tid, 0xff, sizeof(rwlock->rw_tid));
+       }
+       if ((uint32_t*)seqaddr != rwlock->rw_seq) {
+               __builtin_memset(rwlock->rw_seq, 0xff, sizeof(rwlock->rw_seq));
+       }
+       __builtin_memset(rwlock->rw_mis, 0xff, sizeof(rwlock->rw_mis));
+#endif // PTHREAD_RWLOCK_INIT_UNUSED
+       *tidaddr = 0;
+       *seqaddr = (rwlock_seq){
+               .lcntval = PTHRW_RWLOCK_INIT,
+               .rw_seq = PTHRW_RWS_INIT,
+               .ucntval = 0,
+       };
 
        if (attr != NULL && attr->pshared == PTHREAD_PROCESS_SHARED) {
                rwlock->pshared = PTHREAD_PROCESS_SHARED;
@@ -174,33 +432,56 @@ __pthread_rwlock_init(_pthread_rwlock *rwlock, const pthread_rwlockattr_t *attr)
                rwlock->pshared = _PTHREAD_DEFAULT_PSHARED;
                rwlock->rw_flags = PTHRW_KERN_PROCESS_PRIVATE;
        }
-               
-       rwlock->rw_owner = NULL;
+
+       long sig = _PTHREAD_RWLOCK_SIG;
+
+#if DEBUG
        bzero(rwlock->_reserved, sizeof(rwlock->_reserved));
+#endif
+#if PTHREAD_RWLOCK_INIT_UNUSED
+       // For detecting copied rwlocks and smashes during debugging
+       uint32_t sig32 = (uint32_t)sig;
+       uintptr_t guard = ~(uintptr_t)rwlock; // use ~ to hide from leaks
+       __builtin_memcpy(rwlock->_reserved, &guard, sizeof(guard));
+#define countof(x) (sizeof(x) / sizeof(x[0]))
+       rwlock->_reserved[countof(rwlock->_reserved) - 1] = sig32;
+#if defined(__LP64__)
+       rwlock->_pad = sig32;
+#endif
+#endif // PTHREAD_RWLOCK_INIT_UNUSED
 
        // Ensure all contents are properly set before setting signature.
-       OSMemoryBarrier();
-       rwlock->sig = _PTHREAD_RWLOCK_SIG;
-       
+#if defined(__LP64__)
+       // For binary compatibility reasons we cannot require natural alignment of
+       // the 64bit 'sig' long value in the struct. rdar://problem/21610439
+       uint32_t *sig32_ptr = (uint32_t*)&rwlock->sig;
+       uint32_t *sig32_val = (uint32_t*)&sig;
+       *(sig32_ptr + 1) = *(sig32_val + 1);
+       os_atomic_store(sig32_ptr, *sig32_val, release);
+#else
+       os_atomic_store(&rwlock->sig, sig, release);
+#endif
+
        return 0;
 }
 
 static uint32_t
-modbits(uint32_t lgenval, uint32_t updateval, uint32_t savebits)
+_pthread_rwlock_modbits(uint32_t lgenval, uint32_t updateval, uint32_t savebits)
 {
        uint32_t lval = lgenval & PTHRW_BIT_MASK;
        uint32_t uval = updateval & PTHRW_BIT_MASK;
        uint32_t rval, nlval;
 
        nlval = (lval | uval) & ~(PTH_RWL_MBIT);
-       
-       /* reconcile bits on the lock with what kernel needs to set */
+
+       // reconcile bits on the lock with what kernel needs to set
        if ((uval & PTH_RWL_KBIT) == 0 && (lval & PTH_RWL_WBIT) == 0) {
                nlval &= ~PTH_RWL_KBIT;
        }
 
        if (savebits != 0) {
-               if ((savebits & PTH_RWS_WSVBIT) != 0 && (nlval & PTH_RWL_WBIT) == 0 && (nlval & PTH_RWL_EBIT) == 0) {
+               if ((savebits & PTH_RWS_WSVBIT) != 0 && (nlval & PTH_RWL_WBIT) == 0 &&
+                               (nlval & PTH_RWL_EBIT) == 0) {
                        nlval |= (PTH_RWL_WBIT | PTH_RWL_KBIT);
                }
        }
@@ -208,422 +489,568 @@ modbits(uint32_t lgenval, uint32_t updateval, uint32_t savebits)
        return(rval);
 }
 
-__private_extern__ void
-_pthread_rwlock_updateval(_pthread_rwlock *rwlock, uint32_t updateval)
+OS_ALWAYS_INLINE
+static inline void
+_pthread_rwlock_updateval(pthread_rwlock_t *rwlock, uint32_t updateval)
 {
        bool isoverlap = (updateval & PTH_RWL_MBIT) != 0;
 
-       uint64_t oldval64, newval64;
-       volatile uint32_t *lcntaddr, *ucntaddr, *seqaddr;
-
-       /* TBD: restore U bit */
-       RWLOCK_GETSEQ_ADDR(rwlock, &lcntaddr, &ucntaddr, &seqaddr);
+       // TBD: restore U bit
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
 
+       rwlock_seq oldseq, newseq;
+       rwlock_seq_load(seqaddr, &oldseq, RWLOCK_SEQ_LS);
        do {
-               uint32_t lcntval = *lcntaddr;
-               uint32_t rw_seq = *seqaddr;
-               
-               uint32_t newval, newsval;
-               if (isoverlap || is_rws_setunlockinit(rw_seq) != 0) {
+               newseq = oldseq;
+               if (isoverlap || is_rws_unlockinit_set(oldseq.rw_seq)) {
                        // Set S word to the specified value
-                       uint32_t savebits = (rw_seq & PTHRW_RWS_SAVEMASK);
-                       newval = modbits(lcntval, updateval, savebits);
-                       newsval = rw_seq + (updateval & PTHRW_COUNT_MASK);
+                       uint32_t savebits = (oldseq.rw_seq & PTHRW_RWS_SAVEMASK);
+                       newseq.lcntval = _pthread_rwlock_modbits(oldseq.lcntval, updateval,
+                                       savebits);
+                       newseq.rw_seq += (updateval & PTHRW_COUNT_MASK);
                        if (!isoverlap) {
-                               newsval &= PTHRW_COUNT_MASK;
+                               newseq.rw_seq &= PTHRW_COUNT_MASK;
                        }
-                       newsval &= ~PTHRW_RWS_SAVEMASK;
-               } else {
-                       newval = lcntval;
-                       newsval = rw_seq;
+                       newseq.rw_seq &= ~PTHRW_RWS_SAVEMASK;
                }
-
-               oldval64 = (((uint64_t)rw_seq) << 32);
-               oldval64 |= lcntval;
-               newval64 = (((uint64_t)newsval) << 32);
-               newval64 |= newval;
-       } while (OSAtomicCompareAndSwap64Barrier(oldval64, newval64, (volatile int64_t *)lcntaddr) != TRUE);
+       } while (!rwlock_seq_atomic_cmpxchgv(seqaddr, &oldseq, &newseq,
+                       RWLOCK_SEQ_LS, relaxed));
+       RWLOCK_DEBUG_SEQ(update, rwlock, oldseq, newseq, updateval, RWLOCK_SEQ_LS);
 }
 
-#endif /* !BUILDING_VARIANT ] */
-
-static int
-_pthread_rwlock_check_busy(_pthread_rwlock *rwlock)
+OS_ALWAYS_INLINE
+static inline int
+_pthread_rwlock_check_busy(pthread_rwlock_t *rwlock)
 {
        int res = 0;
-       
-       volatile uint32_t *lcntaddr, *ucntaddr, *seqaddr;
-       
-       RWLOCK_GETSEQ_ADDR(rwlock, &lcntaddr, &ucntaddr, &seqaddr);
-       
-       uint32_t rw_lcnt = *lcntaddr;
-       uint32_t rw_ucnt = *ucntaddr;
-       
-       if ((rw_lcnt & PTHRW_COUNT_MASK) != rw_ucnt) {
+
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
+
+       rwlock_seq seq;
+       rwlock_seq_atomic_load(seqaddr, &seq, RWLOCK_SEQ_LSU, relaxed);
+       if ((seq.lcntval & PTHRW_COUNT_MASK) != seq.ucntval) {
                res = EBUSY;
        }
-       
+
        return res;
 }
 
+PTHREAD_NOEXPORT_VARIANT
 int
-pthread_rwlock_destroy(pthread_rwlock_t *orwlock)
+pthread_rwlock_destroy(pthread_rwlock_t *rwlock)
 {
        int res = 0;
-       _pthread_rwlock *rwlock = (_pthread_rwlock *)orwlock;
 
-       if (rwlock->sig == _PTHREAD_RWLOCK_SIG) {
-#if __DARWIN_UNIX03
+       _pthread_lock_lock(&rwlock->lock);
+       if (_pthread_rwlock_check_signature(rwlock)) {
                res = _pthread_rwlock_check_busy(rwlock);
-#endif /* __DARWIN_UNIX03 */
-       } else if (rwlock->sig != _PTHREAD_RWLOCK_SIG_init) {
+       } else if (!_pthread_rwlock_check_signature_init(rwlock)) {
                res = EINVAL;
        }
        if (res == 0) {
                rwlock->sig = _PTHREAD_NO_SIG;
        }
+       _pthread_lock_unlock(&rwlock->lock);
        return res;
 }
 
-
+PTHREAD_NOEXPORT_VARIANT
 int
-pthread_rwlock_init(pthread_rwlock_t *orwlock, const pthread_rwlockattr_t *attr)
+pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
 {
        int res = 0;
-       _pthread_rwlock *rwlock = (_pthread_rwlock *)orwlock;
-       
-#if __DARWIN_UNIX03
+
        if (attr && attr->sig != _PTHREAD_RWLOCK_ATTR_SIG) {
                res = EINVAL;
        }
 
-       if (res == 0 && rwlock->sig == _PTHREAD_RWLOCK_SIG) {
+       if (res == 0 && _pthread_rwlock_check_signature(rwlock)) {
                res = _pthread_rwlock_check_busy(rwlock);
        }
-#endif
        if (res == 0) {
-               LOCK_INIT(rwlock->lock);
-               res = __pthread_rwlock_init(rwlock, attr);
+               _pthread_lock_init(&rwlock->lock);
+               res = _pthread_rwlock_init(rwlock, attr);
        }
        return res;
 }
 
+OS_NOINLINE
 static int
-_pthread_rwlock_check_init(pthread_rwlock_t *orwlock)
+_pthread_rwlock_check_init_slow(pthread_rwlock_t *rwlock)
 {
-       int res = 0;
-       _pthread_rwlock *rwlock = (_pthread_rwlock *)orwlock;
-       if (rwlock->sig != _PTHREAD_RWLOCK_SIG) {
-               res = EINVAL;
-               if (rwlock->sig == _PTHREAD_RWLOCK_SIG_init) {
-                       LOCK(rwlock->lock);
-                       if (rwlock->sig == _PTHREAD_RWLOCK_SIG_init) {
-                               res = __pthread_rwlock_init(rwlock, NULL);
-                       } else if (rwlock->sig == _PTHREAD_RWLOCK_SIG){
-                               res = 0;
-                       }
-                       UNLOCK(rwlock->lock);
-               }
-               if (res != 0) {
-                       PLOCKSTAT_RW_ERROR(orwlock, READ_LOCK_PLOCKSTAT, res);
+       int res = EINVAL;
+
+       if (_pthread_rwlock_check_signature_init(rwlock)) {
+               _pthread_lock_lock(&rwlock->lock);
+               if (_pthread_rwlock_check_signature_init(rwlock)) {
+                       res = _pthread_rwlock_init(rwlock, NULL);
+               } else if (_pthread_rwlock_check_signature(rwlock)){
+                       res = 0;
                }
+               _pthread_lock_unlock(&rwlock->lock);
+       } else if (_pthread_rwlock_check_signature(rwlock)){
+               res = 0;
+       }
+       if (res != 0) {
+               PLOCKSTAT_RW_ERROR(rwlock, READ_LOCK_PLOCKSTAT, res);
+       }
+       return res;
+}
+
+OS_ALWAYS_INLINE
+static inline int
+_pthread_rwlock_check_init(pthread_rwlock_t *rwlock)
+{
+       int res = 0;
+
+       if (!_pthread_rwlock_check_signature(rwlock)) {
+               return _pthread_rwlock_check_init_slow(rwlock);
        }
        return res;
 }
 
+OS_NOINLINE
 static int
-_pthread_rwlock_lock(pthread_rwlock_t *orwlock, bool readlock, bool trylock)
+_pthread_rwlock_lock_wait(pthread_rwlock_t *rwlock, bool readlock,
+               rwlock_seq newseq)
 {
        int res;
-       _pthread_rwlock *rwlock = (_pthread_rwlock *)orwlock;
 
-       res = _pthread_rwlock_check_init(orwlock);
-       if (res != 0) {
-               return res;
-       }
+#ifdef PLOCKSTAT
+       int plockstat = readlock ? READ_LOCK_PLOCKSTAT : WRITE_LOCK_PLOCKSTAT;
+#endif
 
-       uint64_t oldval64, newval64;
-       volatile uint32_t *lcntaddr, *ucntaddr, *seqaddr;
-       RWLOCK_GETSEQ_ADDR(rwlock, &lcntaddr, &ucntaddr, &seqaddr);
+       if (readlock) {
+               RWLOCK_DEBUG_SEQ(rdlock, rwlock, oldseq, newseq, gotlock,
+                               RWLOCK_SEQ_LSU);
+       } else {
+               RWLOCK_DEBUG_SEQ(wrlock, rwlock, oldseq, newseq, gotlock,
+                               RWLOCK_SEQ_LSU);
+       }
 
-       uint32_t newval, newsval;
-       uint32_t lcntval, ucntval, rw_seq;
+       uint32_t updateval;
 
-       bool gotlock;
-       bool retry;
-       int retry_count = 0;
+       PLOCKSTAT_RW_BLOCK(rwlock, plockstat);
 
        do {
-               res = 0;
-               retry = false;
-               
-               lcntval = *lcntaddr;
-               ucntval = *ucntaddr;
-               rw_seq = *seqaddr;
-
-#if __DARWIN_UNIX03
-               if (is_rwl_ebit_set(lcntval)) {
-                       if (rwlock->rw_owner == pthread_self()) {
-                               res = EDEADLK;
-                               break;
-                       }
+               if (readlock) {
+                       updateval = __psynch_rw_rdlock(rwlock, newseq.lcntval,
+                                       newseq.ucntval, newseq.rw_seq, rwlock->rw_flags);
+               } else {
+                       updateval = __psynch_rw_wrlock(rwlock, newseq.lcntval,
+                                       newseq.ucntval, newseq.rw_seq, rwlock->rw_flags);
                }
-#endif /* __DARWIN_UNIX03 */
+               if (updateval == (uint32_t)-1) {
+                       res = errno;
+               } else {
+                       res = 0;
+               }
+       } while (res == EINTR);
+
+       if (res == 0) {
+               _pthread_rwlock_updateval(rwlock, updateval);
+               PLOCKSTAT_RW_BLOCKED(rwlock, plockstat, BLOCK_SUCCESS_PLOCKSTAT);
+       } else {
+               PLOCKSTAT_RW_BLOCKED(rwlock, plockstat, BLOCK_FAIL_PLOCKSTAT);
+               PTHREAD_INTERNAL_CRASH(res, "kernel rwlock returned unknown error");
+       }
 
-               oldval64 = (((uint64_t)rw_seq) << 32);
-               oldval64 |= lcntval;
+       return res;
+}
 
-               /* if l bit is on or u and k bit is clear, acquire lock in userland */
+OS_NOINLINE
+int
+_pthread_rwlock_lock_slow(pthread_rwlock_t *rwlock, bool readlock,
+               bool trylock)
+{
+       int res;
+
+#ifdef PLOCKSTAT
+       int plockstat = readlock ? READ_LOCK_PLOCKSTAT : WRITE_LOCK_PLOCKSTAT;
+#endif
+
+       res = _pthread_rwlock_check_init(rwlock);
+       if (res != 0) return res;
+
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
+
+       rwlock_seq oldseq, newseq;
+       rwlock_seq_atomic_load(seqaddr, &oldseq, RWLOCK_SEQ_LSU, relaxed);
+
+       uint64_t *tidaddr;
+       RWLOCK_GETTID_ADDR(rwlock, &tidaddr);
+       uint64_t selfid = _pthread_threadid_self_np_direct();
+       if (is_rwl_ebit_set(oldseq.lcntval)) {
+               if (os_atomic_load_wide(tidaddr, relaxed) == selfid) return EDEADLK;
+       }
+
+       int retry_count;
+       bool gotlock;
+       do {
+               retry_count = 0;
+retry:
+               newseq = oldseq;
+
+               // if W and K bit are clear or U bit is on, acquire lock in userland
                if (readlock) {
-                       gotlock = can_rwl_readinuser(lcntval);
+                       gotlock = (oldseq.lcntval & (PTH_RWL_WBIT | PTH_RWL_KBIT)) == 0;
                } else {
-                       gotlock = (lcntval & PTH_RWL_RBIT) != 0;
+                       gotlock = (oldseq.lcntval & PTH_RWL_UBIT) != 0;
                }
 
-               uint32_t bits = 0;
-               uint32_t mask = ~0ul;
-               
-               newval = lcntval + PTHRW_INC;
-
-               if (gotlock) {
+               if (trylock && !gotlock) {
+                       // A trylock on a held lock will fail immediately. But since
+                       // we did not load the sequence words atomically, perform a
+                       // no-op CAS to ensure that nobody has unlocked concurrently.
+               } else if (gotlock) {
                        if (readlock) {
-                               if (diff_genseq(lcntval, ucntval) >= PTHRW_MAX_READERS) {
-                                       /* since ucntval may be newer, just redo */
+                               if (diff_genseq(oldseq.lcntval, oldseq.ucntval) >=
+                                               PTHRW_MAX_READERS) {
+                                       // since ucntval may be newer, just redo
                                        retry_count++;
                                        if (retry_count > 1024) {
+                                               gotlock = false;
                                                res = EAGAIN;
-                                               break;
+                                               goto out;
                                        } else {
                                                sched_yield();
-                                               retry = true;
-                                               continue;
+                                               rwlock_seq_atomic_load(seqaddr, &oldseq,
+                                                               RWLOCK_SEQ_LSU, relaxed);
+                                               goto retry;
                                        }
                                }
-                               
-                               // Need to update L (remove R bit) and S word
-                               mask = PTH_RWLOCK_RESET_RBIT;
+                               // Need to update L (remove U bit) and S word
+                               newseq.lcntval &= ~PTH_RWL_UBIT;
                        } else {
-                               mask = PTHRW_COUNT_MASK;
-                               bits = PTH_RWL_IBIT | PTH_RWL_KBIT | PTH_RWL_EBIT;
+                               newseq.lcntval &= PTHRW_COUNT_MASK;
+                               newseq.lcntval |= PTH_RWL_IBIT | PTH_RWL_KBIT | PTH_RWL_EBIT;
                        }
-                       newsval = rw_seq + PTHRW_INC;
-               } else if (trylock) {
-                       res = EBUSY;
-                       break;
+                       newseq.lcntval += PTHRW_INC;
+                       newseq.rw_seq  += PTHRW_INC;
                } else {
                        if (readlock) {
-                               // Need to block in kernel. Remove R bit.
-                               mask = PTH_RWLOCK_RESET_RBIT;
+                               // Need to block in kernel. Remove U bit.
+                               newseq.lcntval &= ~PTH_RWL_UBIT;
                        } else {
-                               bits = PTH_RWL_KBIT | PTH_RWL_WBIT;
+                               newseq.lcntval |= PTH_RWL_KBIT | PTH_RWL_WBIT;
                        }
-                       newsval = rw_seq;
-                       if (is_rws_setseq(rw_seq)) {
-                               newsval &= PTHRW_SW_Reset_BIT_MASK;
-                               newsval |= (newval & PTHRW_COUNT_MASK);
+                       newseq.lcntval += PTHRW_INC;
+                       if (is_rws_sbit_set(oldseq.rw_seq)) {
+                               // Clear the S bit and set S to L
+                               newseq.rw_seq &= (PTHRW_BIT_MASK & ~PTH_RWS_SBIT);
+                               newseq.rw_seq |= (oldseq.lcntval & PTHRW_COUNT_MASK);
                        }
                }
-               newval = (newval & mask) | bits;
-               newval64 = (((uint64_t)newsval) << 32);
-               newval64 |= newval;
+       } while (!rwlock_seq_atomic_cmpxchgv(seqaddr, &oldseq, &newseq,
+                       RWLOCK_SEQ_LS, acquire));
 
-       } while (retry || OSAtomicCompareAndSwap64Barrier(oldval64, newval64, (volatile int64_t *)lcntaddr) != TRUE);
+       if (gotlock) {
+               if (!readlock) os_atomic_store_wide(tidaddr, selfid, relaxed);
+               res = 0;
+       } else if (trylock) {
+               res = EBUSY;
+       } else {
+               res = _pthread_rwlock_lock_wait(rwlock, readlock, newseq);
+       }
 
+out:
 #ifdef PLOCKSTAT
-       int plockstat = readlock ? READ_LOCK_PLOCKSTAT : WRITE_LOCK_PLOCKSTAT;
+       if (res == 0) {
+               PLOCKSTAT_RW_ACQUIRE(rwlock, plockstat);
+       } else {
+               PLOCKSTAT_RW_ERROR(rwlock, plockstat, res);
+       }
+#endif
+
+       return res;
+}
+
+OS_ALWAYS_INLINE
+static inline int
+_pthread_rwlock_lock(pthread_rwlock_t *rwlock, bool readlock, bool trylock)
+{
+#if PLOCKSTAT
+       if (PLOCKSTAT_RW_ACQUIRE_ENABLED() || PLOCKSTAT_RW_ERROR_ENABLED()) {
+               return _pthread_rwlock_lock_slow(rwlock, readlock, trylock);
+       }
 #endif
 
-       // Unable to acquire in userland, transition to kernel.
-       if (res == 0 && !gotlock) {
-               uint32_t updateval;
+       if (os_unlikely(!_pthread_rwlock_check_signature(rwlock))) {
+               return _pthread_rwlock_lock_slow(rwlock, readlock, trylock);
+       }
+
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
+
+       rwlock_seq oldseq, newseq;
+       // no need to perform a single-copy-atomic 128-bit load in the fastpath,
+       // if stores to L and U are seen out of order, we will fallback to the
+       // slowpath below (which has rwlock_seq_atomic_load)
+       rwlock_seq_load(seqaddr, &oldseq, RWLOCK_SEQ_LSU);
+
+       if (os_unlikely(is_rwl_ebit_set(oldseq.lcntval))) {
+               return _pthread_rwlock_lock_slow(rwlock, readlock, trylock);
+       }
+
+       bool gotlock;
+       do {
+               newseq = oldseq;
+
+               // if W and K bit are clear or U bit is on, acquire lock in userland
+               if (readlock) {
+                       gotlock = (oldseq.lcntval & (PTH_RWL_WBIT | PTH_RWL_KBIT)) == 0;
+               } else {
+                       gotlock = (oldseq.lcntval & PTH_RWL_UBIT) != 0;
+               }
 
-               PLOCKSTAT_RW_BLOCK(orwlock, plockstat);
-               
-               do {
+               if (trylock && !gotlock) {
+                       // A trylock on a held lock will fail immediately. But since
+                       // we did not load the sequence words atomically, perform a
+                       // no-op CAS to ensure that nobody has unlocked concurrently.
+               } else if (os_likely(gotlock)) {
                        if (readlock) {
-                               updateval = __psynch_rw_rdlock(orwlock, newval, ucntval, newsval, rwlock->rw_flags);
-                       } else {
-                               updateval = __psynch_rw_wrlock(orwlock, newval, ucntval, newsval, rwlock->rw_flags);
-                       }
-                       if (updateval == (uint32_t)-1) {
-                               res = errno;
+                               if (os_unlikely(diff_genseq(oldseq.lcntval, oldseq.ucntval) >=
+                                               PTHRW_MAX_READERS)) {
+                                       return _pthread_rwlock_lock_slow(rwlock, readlock, trylock);
+                               }
+                               // Need to update L (remove U bit) and S word
+                               newseq.lcntval &= ~PTH_RWL_UBIT;
                        } else {
-                               res = 0;
+                               newseq.lcntval &= PTHRW_COUNT_MASK;
+                               newseq.lcntval |= PTH_RWL_IBIT | PTH_RWL_KBIT | PTH_RWL_EBIT;
                        }
-               } while (res == EINTR);
-               
-               if (res == 0) {
-                       _pthread_rwlock_updateval(rwlock, updateval);
-                       PLOCKSTAT_RW_BLOCKED(orwlock, plockstat, BLOCK_SUCCESS_PLOCKSTAT);
+                       newseq.lcntval += PTHRW_INC;
+                       newseq.rw_seq  += PTHRW_INC;
                } else {
-                       PLOCKSTAT_RW_BLOCKED(orwlock, plockstat, BLOCK_FAIL_PLOCKSTAT);
-                       uint64_t myid;
-                       (void)pthread_threadid_np(pthread_self(), &myid);
-                       PTHREAD_ABORT("kernel lock returned unknown error %x with tid %x\n", updateval, (uint32_t)myid);
+                       return _pthread_rwlock_lock_slow(rwlock, readlock, trylock);
                }
-       }
-       
-       if (res == 0) {
-#if __DARWIN_UNIX03
+       } while (os_unlikely(!rwlock_seq_atomic_cmpxchgv(seqaddr, &oldseq, &newseq,
+                       RWLOCK_SEQ_LS, acquire)));
+
+       if (os_likely(gotlock)) {
                if (!readlock) {
-                       rwlock->rw_owner = pthread_self();
+                       uint64_t *tidaddr;
+                       RWLOCK_GETTID_ADDR(rwlock, &tidaddr);
+                       uint64_t selfid = _pthread_threadid_self_np_direct();
+                       os_atomic_store_wide(tidaddr, selfid, relaxed);
                }
-#endif /* __DARWIN_UNIX03 */
-               PLOCKSTAT_RW_ACQUIRE(orwlock, plockstat);
+               return 0;
+       } else if (trylock) {
+               return EBUSY;
        } else {
-               PLOCKSTAT_RW_ERROR(orwlock, plockstat, res);
+               __builtin_trap();
        }
-       
-       return res;
 }
 
+PTHREAD_NOEXPORT_VARIANT
 int
-pthread_rwlock_rdlock(pthread_rwlock_t *orwlock)
+pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
 {
        // read lock, no try
-       return _pthread_rwlock_lock(orwlock, true, false);
+       return _pthread_rwlock_lock(rwlock, true, false);
 }
 
+PTHREAD_NOEXPORT_VARIANT
 int
-pthread_rwlock_tryrdlock(pthread_rwlock_t *orwlock)
+pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
 {
        // read lock, try lock
-       return _pthread_rwlock_lock(orwlock, true, true);
+       return _pthread_rwlock_lock(rwlock, true, true);
 }
 
+PTHREAD_NOEXPORT_VARIANT
 int
-pthread_rwlock_wrlock(pthread_rwlock_t *orwlock)
+pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
 {
        // write lock, no try
-       return _pthread_rwlock_lock(orwlock, false, false);
+       return _pthread_rwlock_lock(rwlock, false, false);
 }
 
+PTHREAD_NOEXPORT_VARIANT
 int
-pthread_rwlock_trywrlock(pthread_rwlock_t *orwlock)
+pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
 {
        // write lock, try lock
-       return _pthread_rwlock_lock(orwlock, false, true);
+       return _pthread_rwlock_lock(rwlock, false, true);
 }
 
+OS_NOINLINE
+static int
+_pthread_rwlock_unlock_drop(pthread_rwlock_t *rwlock, rwlock_seq oldseq,
+               rwlock_seq newseq)
+{
+       int res;
+
+       RWLOCK_DEBUG_SEQ(unlock, rwlock, oldseq, newseq, !droplock, RWLOCK_SEQ_LSU);
+       uint32_t updateval;
+       do {
+               updateval = __psynch_rw_unlock(rwlock, oldseq.lcntval,
+                               newseq.ucntval, newseq.rw_seq, rwlock->rw_flags);
+               if (updateval == (uint32_t)-1) {
+                       res = errno;
+               } else {
+                       res = 0;
+                       RWLOCK_DEBUG_SEQ(wakeup, rwlock, oldseq, newseq, updateval,
+                                       RWLOCK_SEQ_LSU);
+               }
+       } while (res == EINTR);
+
+       if (res != 0) {
+               PTHREAD_INTERNAL_CRASH(res, "kernel rwunlock returned unknown error");
+       }
+
+       return res;
+}
+
+OS_NOINLINE
 int
-pthread_rwlock_unlock(pthread_rwlock_t *orwlock)
+_pthread_rwlock_unlock_slow(pthread_rwlock_t *rwlock,
+               rwlock_seqfields updated_seqfields)
 {
        int res;
-       _pthread_rwlock *rwlock = (_pthread_rwlock *)orwlock;
+       rwlock_seqfields seqfields = RWLOCK_SEQ_LSU;
 #ifdef PLOCKSTAT
        int wrlock = 0;
 #endif
 
-       res = _pthread_rwlock_check_init(orwlock);
-       if (res != 0) {
-               return res;
+       res = _pthread_rwlock_check_init(rwlock);
+       if (res != 0) return res;
+
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
+
+       rwlock_seq oldseq, newseq;
+       rwlock_seq_load(seqaddr, &oldseq, seqfields);
+
+       if ((oldseq.lcntval & PTH_RWL_UBIT) != 0) {
+               // spurious unlock (unlock of unlocked lock)
+               return 0;
        }
 
-       uint64_t oldval64 = 0, newval64 = 0;
-       volatile uint32_t *lcntaddr, *ucntaddr, *seqaddr;
-       RWLOCK_GETSEQ_ADDR(rwlock, &lcntaddr, &ucntaddr, &seqaddr);
+       if (is_rwl_ebit_set(oldseq.lcntval)) {
+#ifdef PLOCKSTAT
+               wrlock = 1;
+#endif
+               uint64_t *tidaddr;
+               RWLOCK_GETTID_ADDR(rwlock, &tidaddr);
+               os_atomic_store_wide(tidaddr, 0, relaxed);
+       }
 
        bool droplock;
-       bool reload;
-       bool incr_ucnt = true;
-       bool check_spurious = true;
-       uint32_t lcntval, ucntval, rw_seq, ulval = 0, newval, newsval;
-
        do {
-               reload = false;
-               droplock = true;
+               // stop loading & updating fields that have successfully been stored
+               seqfields &= ~updated_seqfields;
 
-               lcntval = *lcntaddr;
-               ucntval = *ucntaddr;
-               rw_seq = *seqaddr;
-
-               oldval64 = (((uint64_t)rw_seq) << 32);
-               oldval64 |= lcntval;
+               newseq = oldseq;
+               if (seqfields & RWLOCK_SEQ_U) {
+                       newseq.ucntval += PTHRW_INC;
+               }
 
-               // check for spurious unlocks
-               if (check_spurious) {
-                       if ((lcntval & PTH_RWL_RBIT) != 0) {
-                               droplock = false;
+               droplock = false;
+               uint32_t oldlcnt = (oldseq.lcntval & PTHRW_COUNT_MASK);
+               if (newseq.ucntval == oldlcnt) {
+                       // last unlock, set L with U and init bits and set S to L with S bit
+                       newseq.lcntval = oldlcnt | PTHRW_RWLOCK_INIT;
+                       newseq.rw_seq =  oldlcnt | PTHRW_RWS_INIT;
+               } else {
+                       // no L/S update if lock is not exclusive or no writer pending
+                       if ((oldseq.lcntval &
+                                       (PTH_RWL_EBIT | PTH_RWL_WBIT | PTH_RWL_KBIT)) == 0) {
+                               continue;
+                       }
 
-                               newval64 = oldval64;
+                       // kernel transition only needed if U == S
+                       if (newseq.ucntval != (oldseq.rw_seq & PTHRW_COUNT_MASK)) {
                                continue;
                        }
-                       check_spurious = false;
+
+                       droplock = true;
+                       // reset all bits and set K
+                       newseq.lcntval = oldlcnt | PTH_RWL_KBIT;
+                       // set I bit on S word
+                       newseq.rw_seq |= PTH_RWS_IBIT;
+                       if ((oldseq.lcntval & PTH_RWL_WBIT) != 0) {
+                               newseq.rw_seq |= PTH_RWS_WSVBIT;
+                       }
                }
+       } while (seqfields != (updated_seqfields = rwlock_seq_atomic_cmpxchgv(
+                       seqaddr, &oldseq, &newseq, seqfields, release)));
 
-               if (is_rwl_ebit_set(lcntval)) {
-#ifdef PLOCKSTAT
-                       wrlock = 1;
+       if (droplock) {
+               res = _pthread_rwlock_unlock_drop(rwlock, oldseq, newseq);
+       }
+
+       PLOCKSTAT_RW_RELEASE(rwlock, wrlock);
+
+       return res;
+}
+
+PTHREAD_NOEXPORT_VARIANT
+int
+pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
+{
+       rwlock_seqfields seqfields = RWLOCK_SEQ_LSU;
+       rwlock_seqfields updated_seqfields = RWLOCK_SEQ_NONE;
+
+#if PLOCKSTAT
+       if (PLOCKSTAT_RW_RELEASE_ENABLED() || PLOCKSTAT_RW_ERROR_ENABLED()) {
+               return _pthread_rwlock_unlock_slow(rwlock, updated_seqfields);
+       }
 #endif
-#if __DARWIN_UNIX03
-                       rwlock->rw_owner = NULL;
-#endif /* __DARWIN_UNIX03 */
-               }
 
-               // update U
-               if (incr_ucnt) {
-                       ulval = (ucntval + PTHRW_INC);
-                       incr_ucnt = (OSAtomicCompareAndSwap32Barrier(ucntval, ulval, (volatile int32_t *)ucntaddr) != TRUE);
-                       newval64 = oldval64;
-                       reload = true;
-                       continue;
-               }
+       if (os_unlikely(!_pthread_rwlock_check_signature(rwlock))) {
+               return _pthread_rwlock_unlock_slow(rwlock, updated_seqfields);
+       }
 
-               // last unlock, note U is already updated ?
-               if ((lcntval & PTHRW_COUNT_MASK) == (ulval & PTHRW_COUNT_MASK)) {
-                       /* Set L with R and init bits and set S to L */
-                       newval  = (lcntval & PTHRW_COUNT_MASK)| PTHRW_RWLOCK_INIT;
-                       newsval = (lcntval & PTHRW_COUNT_MASK)| PTHRW_RWS_INIT;
+       rwlock_seq *seqaddr;
+       RWLOCK_GETSEQ_ADDR(rwlock, &seqaddr);
 
-                       droplock = false;
-               } else {
-                       /* if it is not exclusive or no Writer/yield pending, skip */
-                       if ((lcntval & (PTH_RWL_EBIT | PTH_RWL_WBIT | PTH_RWL_KBIT)) == 0) {
-                               droplock = false;
-                               break;
-                       }
+       rwlock_seq oldseq, newseq;
+       rwlock_seq_load(seqaddr, &oldseq, seqfields);
 
-                       /* kernel transition needed? */
-                       /* U+1 == S? */
-                       if ((ulval + PTHRW_INC) != (rw_seq & PTHRW_COUNT_MASK)) {
-                               droplock = false;
-                               break;
-                       }
+       if (os_unlikely(oldseq.lcntval & PTH_RWL_UBIT)) {
+               // spurious unlock (unlock of unlocked lock)
+               return 0;
+       }
 
-                       /* reset all bits and set k */
-                       newval = (lcntval & PTHRW_COUNT_MASK) | PTH_RWL_KBIT;
-                       /* set I bit on S word */       
-                       newsval = rw_seq | PTH_RWS_IBIT;
-                       if ((lcntval & PTH_RWL_WBIT) != 0) {
-                               newsval |= PTH_RWS_WSVBIT;
-                       }
-               }
+       if (is_rwl_ebit_set(oldseq.lcntval)) {
+               uint64_t *tidaddr;
+               RWLOCK_GETTID_ADDR(rwlock, &tidaddr);
+               os_atomic_store_wide(tidaddr, 0, relaxed);
+       }
 
-               newval64 = (((uint64_t)newsval) << 32);
-               newval64 |= newval;
+       do {
+               if (updated_seqfields) {
+                       return _pthread_rwlock_unlock_slow(rwlock, updated_seqfields);
+               }
 
-       } while (OSAtomicCompareAndSwap64Barrier(oldval64, newval64, (volatile int64_t *)lcntaddr) != TRUE || reload);
+               newseq = oldseq;
+               if (seqfields & RWLOCK_SEQ_U) {
+                       newseq.ucntval += PTHRW_INC;
+               }
 
-       if (droplock) {
-               uint32_t updateval;
-               do {
-                       updateval = __psynch_rw_unlock(orwlock, lcntval, ulval, newsval, rwlock->rw_flags);
-                       if (updateval == (uint32_t)-1) {
-                               res = errno;
+               uint32_t oldlcnt = (oldseq.lcntval & PTHRW_COUNT_MASK);
+               if (os_likely(newseq.ucntval == oldlcnt)) {
+                       // last unlock, set L with U and init bits and set S to L with S bit
+                       newseq.lcntval = oldlcnt | PTHRW_RWLOCK_INIT;
+                       newseq.rw_seq =  oldlcnt | PTHRW_RWS_INIT;
+               } else {
+                       if (os_likely((oldseq.lcntval &
+                                       (PTH_RWL_EBIT | PTH_RWL_WBIT | PTH_RWL_KBIT)) == 0 ||
+                                       newseq.ucntval != (oldseq.rw_seq & PTHRW_COUNT_MASK))) {
+                               // no L/S update if lock is not exclusive or no writer pending
+                               // kernel transition only needed if U == S
                        } else {
-                               res = 0;
+                               return _pthread_rwlock_unlock_slow(rwlock, updated_seqfields);
                        }
-               } while (res == EINTR);
-
-               if (res != 0) {
-                       uint64_t myid = 0;
-                       (void)pthread_threadid_np(pthread_self(), &myid);
-                       PTHREAD_ABORT("rwunlock from kernel with unknown error %x: tid %x\n", res, (uint32_t)myid);
                }
-       }
+       } while (os_unlikely(seqfields != (updated_seqfields =
+                       rwlock_seq_atomic_cmpxchgv(seqaddr, &oldseq, &newseq, seqfields,
+                       release))));
 
-       PLOCKSTAT_RW_RELEASE(orwlock, wrlock);
-
-       return res;
+       return 0;
 }