From: Apple
Date: Fri, 22 Jan 2010 22:55:51 +0000 (+0000)
Subject: Libc-594.1.4.tar.gz
X-Git-Tag: mac-os-x-1063^0
X-Git-Url: https://git.saurik.com/apple/libc.git/commitdiff_plain/51282358e8fdbfc483c0c34e7eae9b89b51f2570

Libc-594.1.4.tar.gz
---

diff --git a/arm/pthreads/Makefile.inc b/arm/pthreads/Makefile.inc
index cc92564..4addcfe 100644
--- a/arm/pthreads/Makefile.inc
+++ b/arm/pthreads/Makefile.inc
@@ -4,4 +4,7 @@ MDSRCS += \
 	pthread_set_self.s \
 	pthread_self.s \
 	pthread_getspecific.s \
-	init_cpu_capabilities.c
+	init_cpu_capabilities.c \
+	start_wqthread.s \
+	thread_start.s
+
diff --git a/arm/pthreads/start_wqthread.s b/arm/pthreads/start_wqthread.s
new file mode 100644
index 0000000..3cf471e
--- /dev/null
+++ b/arm/pthreads/start_wqthread.s
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include
+
+#define __APPLE_API_PRIVATE
+#include
+#undef __APPLE_API_PRIVATE
+
+// This routine is never called directly by user code; it is entered from
+// the kernel. Arguments 0 through 3 arrive in registers r0 through r3; the
+// two extra arguments must be pushed onto the stack before calling
+// _pthread_wqthread():
+// arg4 is in r[4]
+// arg5 is in r[5]
+
+	.text
+	.align 2
+	.globl _start_wqthread
+_start_wqthread:
+	stmfd sp!, {r4, r5}
+	bl __pthread_wqthread
diff --git a/arm/pthreads/thread_start.s b/arm/pthreads/thread_start.s
new file mode 100644
index 0000000..e7574d6
--- /dev/null
+++ b/arm/pthreads/thread_start.s
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include
+
+#define __APPLE_API_PRIVATE
+#include
+#undef __APPLE_API_PRIVATE
+
+// This routine is never called directly by user code; it is entered from
+// the kernel. Arguments 0 through 3 arrive in registers r0 through r3; the
+// two extra arguments must be pushed onto the stack before calling
+// _pthread_start():
+// arg4 is in r[4]
+// arg5 is in r[5]
+
+	.text
+	.align 2
+	.globl _thread_start
+_thread_start:
+	stmfd sp!, {r4, r5}
+	bl __pthread_start
diff --git a/arm/string/Makefile.inc b/arm/string/Makefile.inc
index c89ffa2..73dcb7f 100644
--- a/arm/string/Makefile.inc
+++ b/arm/string/Makefile.inc
@@ -4,10 +4,11 @@
 #
 .PATH: ${.CURDIR}/arm/string
 
-MDSRCS += \
-	bcopy.s \
-	bzero.s \
-	ffs.s \
+MDSRCS += \
+	bcopy.s \
+	bzero.s \
+	ffs.s \
+	memcmp.s \
 	strcmp.s \
 	strlen.s
 
@@ -15,4 +16,4 @@ MDSRCS += \
 MDSRCS += memset_pattern.s
 .endif
 
-SUPPRESSSRCS += memcpy.c memmove.c memset.c strlen.c
+SUPPRESSSRCS += bcmp.c memcpy.c memmove.c memset.c strlen.c
diff --git a/arm/string/NEON/bcopy.s b/arm/string/NEON/bcopy.s
new file mode 100644
index 0000000..30abab1
--- /dev/null
+++ b/arm/string/NEON/bcopy.s
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*****************************************************************************
+ *  Cortex-A8 implementation                                                 *
+ *****************************************************************************/
+
+// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
+//
+// Our tests have shown that NEON is always a performance win for memcpy( ).
+// However, for the specific case of copies from a warm source to a cold
+// destination when the buffer size is between 1k and 32k, it is not enough
+// of a performance win to offset the increased power footprint, resulting
+// in an energy usage regression. Thus, we detect that particular case, and
+// pass those copies through the ARM core registers. All other copies larger
+// than 8 bytes are handled on NEON.
+//
+// Stephen Canon, August 2009
+
+.text
+.code 16
+.syntax unified
+
+// void bcopy(const void * source,
+//            void * destination,
+//            size_t length);
+//
+// void *memmove(void * destination,
+//               const void * source,
+//               size_t n);
+//
+// void *memcpy(void * restrict destination,
+//              const void * restrict source,
+//              size_t n);
+//
+// All three copy n successive bytes from source to destination. memmove and
+// memcpy return destination, whereas bcopy has no return value. Copying takes
+// place as if it were through a temporary buffer -- after return, destination
+// contains exactly the bytes from source, even if the buffers overlap.
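A minimal C model of that contract and of the direction test the code below performs with a single unsigned compare (an editor's sketch, not part of this commit; memmove_sketch is a hypothetical name): copying forward is unsafe only when 0 < destination - source < n, and because the difference wraps for destination < source, one unsigned comparison against n covers every case.

#include <stddef.h>

/* Sketch only: models "subs r3, r0, r1 / cmp r3, r2 / bhs". */
void *memmove_sketch(void *dst, const void *src, size_t n)
{
    char *d = dst;
    const char *s = src;
    size_t offset = (size_t)(d - s);   /* wraps when d < s */

    if (offset == 0)
        return dst;                    /* source == destination: nothing to do */
    if (offset >= n) {                 /* no harmful overlap: copy front-to-back */
        for (size_t i = 0; i < n; i++)
            d[i] = s[i];
    } else {                           /* destination overlaps ahead: back-to-front */
        while (n--)
            d[n] = s[n];
    }
    return dst;
}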
+ +.thumb_func _bcopy +.globl _bcopy +.thumb_func _memmove +.globl _memmove +.thumb_func _memcpy +.globl _memcpy + +.align 2 +_bcopy: + mov r3, r0 // swap the first and second arguments + mov r0, r1 // and fall through into memmove + mov r1, r3 // + +.align 2 +_memmove: +_memcpy: + subs r3, r0, r1 // offset = destination addr - source addr + it eq + bxeq lr // if source == destination, early out + +// Our preference is for using a (faster) front-to-back copy. However, if +// 0 < offset < length, it is necessary to copy back-to-front for correctness. +// We have already ruled out offset == 0, so we can use an unsigned compare +// with length -- if offset is higher, offset is either greater than length +// or negative. + + cmp r3, r2 + bhs L_copyFrontToBack + +/***************************************************************************** + * back to front copy * + *****************************************************************************/ + + mov ip, r0 // copy destination pointer. + add r1, r2 // move source pointer to end of source array + add ip, r2 // move destination pointer to end of dest array + + subs r2, $8 // if length - 8 is negative (i.e. length + blt L_scalarReverseCopy // is less than 8), jump to cleanup path. + tst ip, $7 // if (destination + length) is doubleword + beq L_vectorReverseCopy // aligned, jump to fast path. + +0: ldrb r3, [r1, $-1]! // load byte + sub r2, $1 // decrement length + strb r3, [ip, $-1]! // store byte + tst ip, $7 // test alignment + bne 0b + + cmp r2, $0 // if length - 8 is negative, + blt L_scalarReverseCopy // jump to the cleanup code + +/***************************************************************************** + * destination is doubleword aligned * + *****************************************************************************/ + +L_vectorReverseCopy: + ands r3, r1, $3 // Extract the alignment of the source + bic r1, $3 + tbh [pc, r3, lsl $1] // Dispatch table on source alignment +0: +.short (L_reverseAligned0-0b)/2 // The NEON alignment hardware does not work +.short (L_reverseAligned1-0b)/2 // properly with sub 4-byte alignment and +.short (L_reverseAligned2-0b)/2 // buffers that are uncacheable, so we need +.short (L_reverseAligned3-0b)/2 // to have a software workaround. + +/***************************************************************************** + * source is also at least word aligned * + *****************************************************************************/ + +L_reverseAligned0: + subs r2, $0x38 // if length - 64 is negative, jump to + blt L_reverseVectorCleanup// the cleanup path. + tst ip, $0x38 // if (destination + length) is cacheline + beq L_reverseCachelineAligned // aligned, jump to the fast path. + +0: sub r1, $8 // copy eight bytes at a time until the + vld1.32 {d0}, [r1] // destination is 8 byte aligned. + sub ip, $8 // + sub r2, $8 // + tst ip, $0x38 // + vst1.64 {d0}, [ip, :64] // + bne 0b // + + cmp r2, $0 // if length - 64 is negative, + blt L_reverseVectorCleanup// jump to the cleanup code + +L_reverseCachelineAligned: + sub r3, r2, $0x3c0 // If 1024 < length < 32768, use core + cmp r3, $0x7c00 // register copies instead of NEON to + blo L_useSTMDB // control energy usage. 
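The three instructions above fold the whole 1k-to-32k test into one unsigned compare. A C model of the trick (editor's sketch, not part of this commit; in_stm_window is a hypothetical name), noting that r2 holds length - 64 at this point:

#include <stddef.h>

/* Sketch only: models "sub r3, r2, $0x3c0 / cmp r3, $0x7c00 / blo".
 * The subtraction wraps for small values, so a single unsigned compare
 * tests 1024 <= length < 32768. */
static int in_stm_window(size_t length_minus_64)
{
    return (size_t)(length_minus_64 - 0x3c0) < 0x7c00;
}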
+
+	sub r1, $32			// decrement source
+	sub ip, $32			// decrement destination
+	mov r3, $-32			// load address increment
+	tst r1, $0x1f			// if source shares 32 byte alignment
+	beq L_reverseSourceAligned	// jump to loop with more alignment hints
+
+	vld1.32 {q2,q3}, [r1], r3	// This loop handles 4-byte aligned copies
+	vld1.32 {q0,q1}, [r1], r3	// as generally as possible.
+	subs r2, $64			//
+	vst1.64 {q2,q3}, [ip,:256], r3	// The Cortex-A8 NEON unit does not always
+	blt 1f				// properly handle misalignment in vld1
+.align 3				// with an element size of 8 or 16, so
+0:	vld1.32 {q2,q3}, [r1], r3	// this is the best we can do without
+	vst1.64 {q0,q1}, [ip,:256], r3	// handling alignment in software.
+	vld1.32 {q0,q1}, [r1], r3	//
+	subs r2, $64			//
+	vst1.64 {q2,q3}, [ip,:256], r3	//
+	bge 0b				//
+	b 1f				//
+
+L_reverseSourceAligned:
+	vld1.64 {q2,q3}, [r1,:256], r3	// Identical to loop above except for
+	vld1.64 {q0,q1}, [r1,:256], r3	// additional alignment information; this
+	subs r2, $64			// gets an additional .5 bytes per cycle
+	vst1.64 {q2,q3}, [ip,:256], r3	// on Cortex-A8.
+	blt 1f				//
+.align 3				//
+0:	vld1.64 {q2,q3}, [r1,:256], r3	//
+	vst1.64 {q0,q1}, [ip,:256], r3	//
+	vld1.64 {q0,q1}, [r1,:256], r3	//
+	subs r2, $64			//
+	vst1.64 {q2,q3}, [ip,:256], r3	//
+	bge 0b				//
+1:	vst1.64 {q0,q1}, [ip,:256], r3	// loop cleanup: final 32 byte store
+	add r1, $32			// point source at last element stored
+	add ip, $32			// point destination at last element stored
+
+L_reverseVectorCleanup:
+	adds r2, $0x38			// If (length - 8) < 0, goto scalar cleanup
+	blt L_scalarReverseCopy		//
+
+0:	sub r1, $8			// copy eight bytes at a time until
+	vld1.32 {d0}, [r1]		// (length - 8) < 0.
+	sub ip, $8			//
+	subs r2, $8			//
+	vst1.64 {d0}, [ip, :64]		//
+	bge 0b				//
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarReverseCopy:
+	adds r2, #0x8			// restore length
+	it eq				// if this is zero
+	bxeq lr				// early out
+
+0:	ldrb r3, [r1, #-1]!		// load a byte from source
+	strb r3, [ip, #-1]!		// store to destination
+	subs r2, #0x1			// subtract one from length
+	bne 0b				// if non-zero, repeat
+	bx lr				// return
+
+/*****************************************************************************
+ *  STMDB loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+L_useSTMDB:
+	push {r4-r8,r10,r11}
+.align 3
+0:	ldmdb r1!, {r3-r8,r10,r11}
+	subs r2, #0x40
+	stmdb ip!, {r3-r8,r10,r11}
+	ldmdb r1!, {r3-r8,r10,r11}
+	pld [r1, #-0x40]
+	stmdb ip!, {r3-r8,r10,r11}
+	bge 0b
+	pop {r4-r8,r10,r11}
+	b L_reverseVectorCleanup
+
+/*****************************************************************************
+ *  Misaligned reverse vld1 loop                                             *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
+// use vext.8 to extract a double word to store, and perform an 8-byte aligned
+// store to destination.
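Before the macro itself, a C model of the trick just described (editor's sketch, not part of this commit; extract_misaligned is a hypothetical helper): on little-endian ARM, vext.8 d0, d2, d3, #offset yields the doubleword that starts offset bytes into the pair {d2,d3}, so two aligned loads supply one misaligned value.

#include <stdint.h>

/* Sketch only: offset is the source misalignment mod 4, always 1, 2 or 3
 * here (offset 0 takes the aligned path), so the shifts stay in range. */
static uint64_t extract_misaligned(uint64_t lo, uint64_t hi, unsigned offset)
{
    return (lo >> (8 * offset)) | (hi << (64 - 8 * offset));
}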
+ +#define RCOPY_UNALIGNED(offset) \ + subs r2, $8 ;\ + blt 2f ;\ + sub r1, $8 ;\ + sub ip, $8 ;\ + mov r3, $-8 ;\ + vld1.32 {d2,d3}, [r1], r3 ;\ + subs r2, $8 ;\ + blt 1f ;\ +0: vext.8 d0, d2, d3, $(offset);\ + vmov d3, d2 ;\ + vld1.32 {d2}, [r1], r3 ;\ + subs r2, $8 ;\ + vst1.64 {d0}, [ip, :64], r3 ;\ + bge 0b ;\ +1: vext.8 d0, d2, d3, $(offset);\ + add r1, $8 ;\ + vst1.64 {d0}, [ip, :64] ;\ +2: add r2, $8 ;\ + add r1, $(offset);\ + b L_scalarReverseCopy + +L_reverseAligned1: + RCOPY_UNALIGNED(1) +L_reverseAligned2: + RCOPY_UNALIGNED(2) +L_reverseAligned3: + RCOPY_UNALIGNED(3) + +/***************************************************************************** + * front to back copy * + *****************************************************************************/ + +L_copyFrontToBack: + mov ip, r0 // copy destination pointer. + subs r2, $8 // if length - 8 is negative (i.e. length + blt L_scalarCopy // is less than 8), jump to cleanup path. + tst ip, $7 // if the destination is doubleword + beq L_vectorCopy // aligned, jump to fast path. + +0: ldrb r3, [r1], $1 // load byte + sub r2, $1 // decrement length + strb r3, [ip], $1 // store byte + tst ip, $7 // test alignment + bne 0b + + cmp r2, $0 // if length - 8 is negative, + blt L_scalarCopy // jump to the cleanup code + +/***************************************************************************** + * destination is doubleword aligned * + *****************************************************************************/ + +L_vectorCopy: + ands r3, r1, $3 // Extract the alignment of the source + bic r1, $3 + tbh [pc, r3, lsl $1] // Dispatch table on source alignment +0: +.short (L_sourceAligned0-0b)/2 // The NEON alignment hardware does not work +.short (L_sourceAligned1-0b)/2 // properly with sub 4-byte alignment and +.short (L_sourceAligned2-0b)/2 // buffers that are uncacheable, so we need +.short (L_sourceAligned3-0b)/2 // to have a software workaround. + +/***************************************************************************** + * source is also at least word aligned * + *****************************************************************************/ + +L_sourceAligned0: + subs r2, $0x38 // If (length - 64) < 0 + blt L_vectorCleanup // jump to cleanup code + tst ip, $0x38 // If destination is 64 byte aligned + beq L_cachelineAligned // jump to main loop + +0: vld1.32 {d0}, [r1]! // Copy one double word at a time until + sub r2, $8 // the destination is 64-byte aligned. + vst1.64 {d0}, [ip, :64]! // + tst ip, $0x38 // + bne 0b // + + cmp r2, $0 // If (length - 64) < 0, goto cleanup + blt L_vectorCleanup // + +L_cachelineAligned: + sub r3, r2, $0x3c0 // If 1024 < length < 32768, use core + cmp r3, $0x7c00 // register copies instead of NEON to + blo L_useSTMIA // control energy usage. + tst r1, $0x1f // If source has 32-byte alignment, use + beq L_sourceAligned32 // an optimized loop. + + vld1.32 {q2,q3}, [r1]! // This is the most common path for small + vld1.32 {q0,q1}, [r1]! // copies, which are alarmingly frequent. + subs r2, #0x40 // It requires 4-byte alignment on the + vst1.64 {q2,q3}, [ip, :256]! // source. For ordinary malloc'd buffers, + blt 1f // this path could handle only single-byte +.align 3 // alignment at speed by using vld1.8 +0: vld1.32 {q2,q3}, [r1]! // instead of vld1.32; however, the NEON + vst1.64 {q0,q1}, [ip, :256]! // alignment handler misbehaves for some + vld1.32 {q0,q1}, [r1]! // special copies if the element size is + subs r2, #0x40 // 8 or 16, so we need to work around + vst1.64 {q2,q3}, [ip, :256]! 
// sub 4-byte alignment in software, in
+	bge 0b				// another code path.
+	b 1f
+
+L_sourceAligned32:
+	vld1.64 {q2,q3}, [r1, :256]!	// When the source shares 32-byte alignment
+	vld1.64 {q0,q1}, [r1, :256]!	// with the destination, we use this loop
+	subs r2, #0x40			// instead, which specifies the maximum
+	vst1.64 {q2,q3}, [ip, :256]!	// :256 alignment on all loads and stores.
+	blt 1f				//
+.align 3				// This gets an additional .5 bytes per
+0:	vld1.64 {q2,q3}, [r1, :256]!	// cycle for in-cache copies, which is not
+	vst1.64 {q0,q1}, [ip, :256]!	// insignificant for this (rather common)
+	vld1.64 {q0,q1}, [r1, :256]!	// case.
+	subs r2, #0x40			//
+	vst1.64 {q2,q3}, [ip, :256]!	// This is identical to the above loop,
+	bge 0b				// except for the additional alignment.
+1:	vst1.64 {q0,q1}, [ip, :256]!	//
+
+L_vectorCleanup:
+	adds r2, $0x38			// If (length - 8) < 0, goto scalar cleanup
+	blt L_scalarCopy		//
+
+0:	vld1.32 {d0}, [r1]!		// Copy one doubleword at a time until
+	subs r2, $8			// (length - 8) < 0.
+	vst1.64 {d0}, [ip, :64]!	//
+	bge 0b				//
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarCopy:
+	adds r2, #0x8			// restore length
+	it eq				// if this is zero
+	bxeq lr				// early out
+
+0:	ldrb r3, [r1], #1		// load a byte from source
+	strb r3, [ip], #1		// store to destination
+	subs r2, #1			// subtract one from length
+	bne 0b				// if non-zero, repeat
+	bx lr				// return
+
+/*****************************************************************************
+ *  STMIA loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+L_useSTMIA:
+	push {r4-r8,r10,r11}
+.align 3
+0:	ldmia r1!, {r3-r8,r10,r11}
+	subs r2, r2, #64
+	stmia ip!, {r3-r8,r10,r11}
+	ldmia r1!, {r3-r8,r10,r11}
+	pld [r1, #64]
+	stmia ip!, {r3-r8,r10,r11}
+	bge 0b
+	pop {r4-r8,r10,r11}
+	b L_vectorCleanup
+
+/*****************************************************************************
+ *  Misaligned vld1 loop                                                     *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
+// use vext.8 to extract a double word to store, and perform an 8-byte aligned
+// store to destination.
+
+#define COPY_UNALIGNED(offset) \
+	subs r2, $8 ;\
+	blt 2f ;\
+	vld1.32 {d2,d3}, [r1]! ;\
+	subs r2, $8 ;\
+	blt 1f ;\
+0:	vext.8 d0, d2, d3, $(offset);\
+	vmov d2, d3 ;\
+	vld1.32 {d3}, [r1]! ;\
+	subs r2, $8 ;\
+	vst1.64 {d0}, [ip, :64]! ;\
+	bge 0b ;\
+1:	vext.8 d0, d2, d3, $(offset);\
+	sub r1, $8 ;\
+	vst1.64 {d0}, [ip, :64]! ;\
+2:	add r1, $(offset);\
+	add r2, $8 ;\
+	b L_scalarCopy
+
+L_sourceAligned1:
+	COPY_UNALIGNED(1)
+L_sourceAligned2:
+	COPY_UNALIGNED(2)
+L_sourceAligned3:
+	COPY_UNALIGNED(3)
diff --git a/arm/string/NEON/bzero.s b/arm/string/NEON/bzero.s
new file mode 100644
index 0000000..50b1c8e
--- /dev/null
+++ b/arm/string/NEON/bzero.s
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License.
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/********************************************************************** + * Cortex-A8 implementation * + **********************************************************************/ + +// Cortex-A8 implementations of memset( ) and bzero( ). Main loop is 64-byte +// NEON stores, unless the buffer length is > 1k. Beyond that point, there is +// little to no speed advantage with NEON (and a slight regression in some +// measured cases), so we switch to the GPRs. +// +// The crossover point should be reevaluated for future architectures. +// +// -- Stephen Canon, August 2009 + +.text +.syntax unified +.code 16 + +// void bzero(void * destination, +// size_t length); +// +// zeros out a buffer length bytes long, beginning at the address destination. +.thumb_func ___bzero +.globl ___bzero +.thumb_func _bzero +.globl _bzero +.align 2 +___bzero: +_bzero: + mov r2, r1 // match the API to memset(dest, 0, length) + eor r1, r1 // and fall through into memset + +// void *memset(void * destination, +// int value, size_t n); +// +// writes value converted to an unsigned char to n successive bytes, beginning +// at destination. + +// Notes on register usage: +// +// Throughout this function, registers have nearly constant usage; the pattern +// is: +// +// r0 holds the original destination pointer, unmodified. This value +// must be returned by the routine, so it is easiest to just leave it +// in place. +// r1 holds the value that is being copied into the buffer, in some stage +// of splattedness. The low byte is guaranteed to always have the value +// but the higher bytes may or may not contain copies of it. +// r2 holds the length minus some offset, where the offset is always the +// number of bytes that the current loop stores per iteration. +// r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted +// copies of the value to be stored. +// ip holds a pointer to the lowest byte in the array that has not yet been +// set to hold value. +// q0 and q1 hold splatted copies of the value in the vector path, and are +// otherwise unused. + +.thumb_func _memset +.globl _memset +.align 2 +_memset: + mov ip, r0 // copy destination pointer. + subs r2, #0x8 // if length - 8 is negative (i.e. length + and r1, #0xff // is less than 8), jump to cleanup path. + blt L_scalarCleanup // + + tst ip, #0x7 // if the destination is doubleword + beq L_vectorCopy // aligned, jump to fast path. + +0: strb r1, [ip], #1 // store one byte at a time until + sub r2, #1 // destination pointer is 8 byte aligned. + tst ip, #7 // + bne 0b // + + cmp r2, #0x0 // if length - 8 is negative, + blt L_scalarCleanup // jump to the cleanup code + +L_vectorCopy: + vdup.8 q0, r1 // splat the byte to be stored across + subs r2, #0x38 // q0 and q1, and check if length - 64 + vmov q1, q0 // is negative; if so, jump to the + blt L_vectorCleanup // cleanup code. 
+ + tst ip, #0x38 // if the destination is cacheline + beq L_cachelineAligned // aligned, jump to the fast path. + +0: vst1.64 {d0}, [ip, :64]! // store one double word at a time until + sub r2, #8 // the destination is 64-byte aligned + tst ip, #0x38 // + bne 0b + + cmp r2, #0x0 // if length - 64 is negative, + blt L_vectorCleanup // jump to the cleanup code + +L_cachelineAligned: + cmp r2, #0x3c0 // if length > 1024 + bge L_useSTMIA // we use stmia instead + +.align 4 // main loop +0: vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes + subs r2, #0x40 // decrement length by 64 + vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes + bge 0b // if length - 64 >= 0, continue + +L_vectorCleanup: + adds r2, #0x38 // if (length - 8) < 0, goto scalar cleanup + blt L_scalarCleanup // + +0: subs r2, #8 // store one double word at a time until + vst1.64 {d0}, [ip, :64]! // (length - 8) < 0. + bge 0b + +L_scalarCleanup: + adds r2, #8 // restore length + beq 1f // early out if zero. + +0: strb r1, [ip], #1 // store one byte at a time until length + subs r2, #1 // is zero. + bne 0b // +1: bx lr // return. + +// STMIA loop for large buffers +// +// For stores larger than 1024 bytes, we use STMIA because we can't get enough +// of a speedup from NEON to offset the higher power draw of the NEON unit. +// +// This crossover should be reevaluated on future architectures. +// +// We avoid using r7 and r9 even though it's not strictly necessary. + +L_useSTMIA: + push {r4,r5,r6,r8,r10,r11} + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r8, r1 + mov r10, r1 + mov r11, r1 +.align 4 +0: stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} + subs r2, #0x40 + stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} + bge 0b + pop {r4,r5,r6,r8,r10,r11} + b L_vectorCleanup diff --git a/arm/string/bcopy.s b/arm/string/bcopy.s index da24152..2e67e1c 100644 --- a/arm/string/bcopy.s +++ b/arm/string/bcopy.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006, 2009 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -21,6 +21,19 @@ * @APPLE_LICENSE_HEADER_END@ */ +#if defined __thumb2__ && defined __ARM_NEON__ + +// Use our tuned NEON implementation when it is available. Otherwise fall back +// on more generic ARM code. + +#include "NEON/bcopy.s" + +#else // defined __thumb2__ && defined __ARM_NEON__ + +/***************************************************************************** + * ARMv5 and ARMv6 implementation * + *****************************************************************************/ + #include .text @@ -398,4 +411,5 @@ Lalign3_forward_loop: Lexit: ldmfd sp!, {r0, r4, r5, r7, pc} +#endif // defined __thumb2__ && defined __ARM_NEON__ diff --git a/arm/string/bzero.s b/arm/string/bzero.s index ada3727..e3a3a8d 100644 --- a/arm/string/bzero.s +++ b/arm/string/bzero.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006, 2009 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,6 +20,15 @@ * * @APPLE_LICENSE_HEADER_END@ */ + +#if defined __thumb2__ && defined __ARM_NEON__ + +// Use our tuned NEON implementation when it is available. Otherwise fall back +// on more generic ARM code. 
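An aside on the splat step shared by both memset( ) paths above (editor's sketch, not part of this commit; splat_byte is a hypothetical name): vdup.8 broadcasts the fill byte across a 128-bit NEON register, while the L_useSTMIA path builds the same 32-bit pattern in a core register with two orr instructions.

#include <stdint.h>

/* Sketch only: models "orr r1, r1, r1, lsl #8 / orr r1, r1, r1, lsl #16". */
static uint32_t splat_byte(uint8_t value)
{
    uint32_t v = value;  /* 0x000000VV */
    v |= v << 8;         /* 0x0000VVVV */
    v |= v << 16;        /* 0xVVVVVVVV */
    return v;
}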
+ +#include "NEON/bzero.s" + +#else // defined __thumb2__ && defined __ARM_NEON__ #include #include @@ -160,3 +169,5 @@ L_unaligned: b L_lessthan64aligned X_LEAF(___bzero, _bzero) + +#endif // defined __thumb2__ && defined __ARM_NEON__ diff --git a/arm/string/memcmp.s b/arm/string/memcmp.s new file mode 100644 index 0000000..83e0f87 --- /dev/null +++ b/arm/string/memcmp.s @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +// ARM Assembly implementation of memcmp( ) from +// Uses Thumb2 if it is available, otherwise generates ARM code. +// +// -- Stephen Canon, August 2009 +// +// The basic idea is to use word compares instead of byte compares as long as +// at least four bytes remain to be compared. However, because memcmp( ) +// compares the buffers as though they were big-endian unsigned integers, we +// need to byte-reverse each word before comparing them. +// +// If the buffers are not word aligned, or they are shorter than four bytes, +// we just use a simple byte comparison loop instead. +// +// int bcmp(void *src1, void *src2, size_t length); +// int memcmp(void *src1, void *src2, size_t length); + +#include + + .text + .syntax unified +#if defined __thumb2__ + .code 16 + .thumb_func _bcmp + .thumb_func _memcmp +#else + .code 32 +#endif + .globl _bcmp + .globl _memcmp + .align 3 +_bcmp: +_memcmp: + +#ifdef _ARM_ARCH_6 + subs ip, r2, #4 // if length < 4 + bmi L_useByteCompares // jump to the byte comparison loop + + orr r3, r0, r1 // if the buffers are + tst r3, #3 // not word aligned + bne L_useByteCompares // jump to the byte comparison loop + +.align 3 +L_wordCompare: // Here we know that both buffers are word + ldr r2, [r0], #4 // aligned, and (length - 4) > 0, so at least + ldr r3, [r1], #4 // four bytes remain to be compared. We load + subs ip, #4 // a word from each buffer, and byte reverse + bmi L_lastWord // the loaded words. We also decrement the + rev r2, r2 // length by four and jump out of this loop if + rev r3, r3 // the result is negative. Then we compare the + cmp r2, r3 // reversed words, and continue the loop only + beq L_wordCompare // if they are equal. +L_wordsUnequal: + ite hi // If the words compared unequal, return +/- 1 + movhi r0, #1 // according to the result of the comparison. + movls r0, #-1 // + bx lr // +L_lastWord: + rev r2, r2 // If we just loaded the last complete words + rev r3, r3 // from the buffers, byte-reverse them and + cmp r2, r3 // compare. If they are unequal, jump to the + bne L_wordsUnequal // return path. + add r2, ip, #4 // Otherwise, fall into the cleanup code. 
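An editor's sketch in C of the word-compare core above (not part of this commit; rev32 and memcmp_words are hypothetical names): memcmp ordering is that of big-endian unsigned integers, so each little-endian word load is byte-reversed with rev before a single unsigned compare decides the result.

#include <stdint.h>
#include <stddef.h>

static uint32_t rev32(uint32_t x)   /* models the ARM "rev" instruction */
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00U)
         | ((x << 8) & 0x00ff0000U) | (x << 24);
}

/* Sketch only: word-at-a-time core for word-aligned buffers, nwords > 0. */
int memcmp_words(const uint32_t *a, const uint32_t *b, size_t nwords)
{
    for (size_t i = 0; i < nwords; i++) {
        uint32_t x = rev32(a[i]), y = rev32(b[i]);
        if (x != y)
            return x > y ? 1 : -1;   /* the "ite hi / movhi / movls" path */
    }
    return 0;
}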
+#endif // _ARM_ARCH_6 + +L_useByteCompares: + tst r2, r2 // If the length is exactly zero + beq L_returnZero // avoid doing any loads and return zero. + mov r3, r0 +.align 3 +L_byteCompareLoop: + ldrb r0, [r3], #1 // Load a byte from each buffer, and decrement + ldrb ip, [r1], #1 // the length by one. If the decremented + subs r2, #1 // length is zero, exit the loop. Otherwise + beq L_lastByte // subtract the loaded bytes; if their + subs r0, ip // difference is zero, continue the comparison + beq L_byteCompareLoop // loop. Otherwise, return their difference. + bx lr +L_returnZero: + mov r0, ip +L_lastByte: + sub r0, ip // Return the difference of the final bytes + bx lr diff --git a/arm/sys/OSAtomic-v4.c b/arm/sys/OSAtomic-v4.c index c725cb4..723d84f 100644 --- a/arm/sys/OSAtomic-v4.c +++ b/arm/sys/OSAtomic-v4.c @@ -187,6 +187,30 @@ bool OSAtomicCompareAndSwap32Barrier( int32_t oldValue, int32_t newValue, vol return OSAtomicCompareAndSwap32(oldValue, newValue, theValue); } +bool +OSAtomicCompareAndSwapInt(int oldValue, int newValue, volatile int *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, theValue); +} + +bool +OSAtomicCompareAndSwapIntBarrier(int oldValue, int newValue, volatile int *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, theValue); +} + +bool +OSAtomicCompareAndSwapLong(long oldValue, long newValue, volatile long *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, (volatile int32_t *)theValue); +} + +bool +OSAtomicCompareAndSwapLongBarrier(long oldValue, long newValue, volatile long *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, (volatile int32_t *)theValue); +} + bool OSAtomicCompareAndSwap64( int64_t oldValue, int64_t newValue, volatile int64_t *theValue ) { bool result; diff --git a/gdtoa/FreeBSD/gdtoa-misc.c.patch b/gdtoa/FreeBSD/gdtoa-misc.c.patch index 8974654..261ec59 100644 --- a/gdtoa/FreeBSD/gdtoa-misc.c.patch +++ b/gdtoa/FreeBSD/gdtoa-misc.c.patch @@ -1,5 +1,5 @@ ---- gdtoa-misc.c.orig 2008-11-05 15:59:34.000000000 -0800 -+++ gdtoa-misc.c 2008-11-05 16:05:28.000000000 -0800 +--- gdtoa-misc.c.orig 2010-01-07 22:03:21.000000000 -0800 ++++ gdtoa-misc.c 2010-01-07 22:25:33.000000000 -0800 @@ -29,9 +29,20 @@ THIS SOFTWARE. /* Please send bug reports to David M. Gay (dmg at acm dot org, * with " at " changed at "@" and " dot " changed to "."). 
*/ @@ -48,7 +48,7 @@ Bigint * Balloc #ifdef KR_headers -@@ -53,8 +84,25 @@ Balloc +@@ -53,9 +84,26 @@ Balloc #ifndef Omit_Private_Memory unsigned int len; #endif @@ -70,10 +70,21 @@ + } +#else /* !GDTOA_TSD */ ACQUIRE_DTOA_LOCK(0); +- if ( (rv = freelist[k]) !=0) { +#endif /* GDTOA_TSD */ - if ( (rv = freelist[k]) !=0) { ++ if (k <= Kmax && (rv = freelist[k]) !=0) { freelist[k] = rv->next; } + else { +@@ -65,7 +113,7 @@ Balloc + #else + len = (sizeof(Bigint) + (x-1)*sizeof(ULong) + sizeof(double) - 1) + /sizeof(double); +- if (pmem_next - private_mem + len <= PRIVATE_mem) { ++ if (k <= Kmax && pmem_next - private_mem + len <= PRIVATE_mem) { + rv = (Bigint*)pmem_next; + pmem_next += len; + } @@ -75,7 +123,9 @@ Balloc rv->k = k; rv->maxwds = x; @@ -84,20 +95,28 @@ rv->sign = rv->wds = 0; return rv; } -@@ -89,10 +139,16 @@ Bfree +@@ -89,10 +139,20 @@ Bfree #endif { if (v) { +- ACQUIRE_DTOA_LOCK(0); +- v->next = freelist[v->k]; +- freelist[v->k] = v; +- FREE_DTOA_LOCK(0); ++ if (v->k > Kmax) ++ free((void*)v); ++ else { +#ifdef GDTOA_TSD -+ Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); ++ Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); +#else /* !GDTOA_TSD */ - ACQUIRE_DTOA_LOCK(0); ++ ACQUIRE_DTOA_LOCK(0); +#endif /* GDTOA_TSD */ - v->next = freelist[v->k]; - freelist[v->k] = v; ++ v->next = freelist[v->k]; ++ freelist[v->k] = v; +#ifndef GDTOA_TSD - FREE_DTOA_LOCK(0); ++ FREE_DTOA_LOCK(0); +#endif /* GDTOA_TSD */ ++ } } } diff --git a/gdtoa/gdtoa-misc-fbsd.c b/gdtoa/gdtoa-misc-fbsd.c index 8540a0c..659f69c 100644 --- a/gdtoa/gdtoa-misc-fbsd.c +++ b/gdtoa/gdtoa-misc-fbsd.c @@ -103,7 +103,7 @@ Balloc #else /* !GDTOA_TSD */ ACQUIRE_DTOA_LOCK(0); #endif /* GDTOA_TSD */ - if ( (rv = freelist[k]) !=0) { + if (k <= Kmax && (rv = freelist[k]) !=0) { freelist[k] = rv->next; } else { @@ -113,7 +113,7 @@ Balloc #else len = (sizeof(Bigint) + (x-1)*sizeof(ULong) + sizeof(double) - 1) /sizeof(double); - if (pmem_next - private_mem + len <= PRIVATE_mem) { + if (k <= Kmax && pmem_next - private_mem + len <= PRIVATE_mem) { rv = (Bigint*)pmem_next; pmem_next += len; } @@ -139,16 +139,20 @@ Bfree #endif { if (v) { + if (v->k > Kmax) + free((void*)v); + else { #ifdef GDTOA_TSD - Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); + Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); #else /* !GDTOA_TSD */ - ACQUIRE_DTOA_LOCK(0); + ACQUIRE_DTOA_LOCK(0); #endif /* GDTOA_TSD */ - v->next = freelist[v->k]; - freelist[v->k] = v; + v->next = freelist[v->k]; + freelist[v->k] = v; #ifndef GDTOA_TSD - FREE_DTOA_LOCK(0); + FREE_DTOA_LOCK(0); #endif /* GDTOA_TSD */ + } } } diff --git a/gen/asl.c b/gen/asl.c index 5a18acd..8f1f34d 100644 --- a/gen/asl.c +++ b/gen/asl.c @@ -84,6 +84,7 @@ time_t asl_parse_time(const char *); const char *asl_syslog_faciliy_num_to_name(int n); __private_extern__ asl_client_t *_asl_open_default(); +__private_extern__ int _asl_send_level_message(aslclient ac, aslmsg msg, int level, const char *message); /* notify SPI */ uint32_t notify_register_plain(const char *name, int *out_token); @@ -2312,17 +2313,6 @@ asl_vlog(aslclient ac, aslmsg a, int level, const char *format, va_list ap) if (level < ASL_LEVEL_EMERG) level = ASL_LEVEL_EMERG; if (level > ASL_LEVEL_DEBUG) level = ASL_LEVEL_DEBUG; - str = NULL; - asprintf(&str, "%d", level); - if (str == NULL) - { - if ((msg != NULL) && (my_msg != 0)) asl_free(msg); - return -1; - } - - asl_set(msg, ASL_KEY_LEVEL, str); - free(str); - /* insert strerror for %m */ len = 0; elen = 0; @@ 
-2409,11 +2399,9 @@ asl_vlog(aslclient ac, aslmsg a, int level, const char *format, va_list ap) return -1; } - asl_set(msg, ASL_KEY_MSG, str); + status = _asl_send_level_message(ac, (aslmsg)msg, level, str); free(str); - status = asl_send(ac, (aslmsg)msg); - if ((msg != NULL) && (my_msg != 0)) asl_free(msg); return status; } @@ -2725,18 +2713,17 @@ asl_format_message(aslmsg msg, const char *mfmt, const char *tfmt, uint32_t text } /* - * asl_send: send a message + * asl_send (internal version): send a message * This routine may be used instead of asl_log() or asl_vlog() if asl_set() * has been used to set all of a message's attributes. - * msg: an aslmsg * returns 0 for success, non-zero for failure */ -int -asl_send(aslclient ac, aslmsg msg) +__private_extern__ int +_asl_send_level_message(aslclient ac, aslmsg msg, int level, const char *message) { char *str, *out_raw; caddr_t out; - uint32_t i, len, outlen, level, lmask, outstatus, filter, check, senderx, facilityx; + uint32_t i, len, outlen, lmask, outstatus, filter, check, senderx, facilityx; uint64_t v64; const char *val; char *name, *x; @@ -2745,7 +2732,7 @@ asl_send(aslclient ac, aslmsg msg) int status, rc_filter; asl_client_t *asl; int use_global_lock; - asl_msg_t *mt; + asl_msg_t *mt, *tmp_msg; char hname[_POSIX_HOST_NAME_MAX]; kern_return_t kstatus; @@ -2760,8 +2747,6 @@ asl_send(aslclient ac, aslmsg msg) if (msg == NULL) return 0; - level = ASL_LEVEL_DEBUG; - val = asl_get(msg, ASL_KEY_LEVEL); if (val != NULL) level = atoi(val); @@ -2814,6 +2799,26 @@ asl_send(aslclient ac, aslmsg msg) rc_filter = 1; } + /* + * Copy the message to tmp_msg to make setting values thread-safe + */ + tmp_msg = calloc(1, sizeof(asl_msg_t)); + if (tmp_msg == NULL) return -1; + + tmp_msg->type = ASL_TYPE_MSG; + + mt = (asl_msg_t *)msg; + for (i = 0; i < mt->count; i++) + { + asl_set(tmp_msg, mt->key[i], mt->val[i]); + } + + /* + * Set Level and Message from parameters. 
+ */ + if (message != NULL) asl_set(tmp_msg, ASL_KEY_MSG, message); + asl_set(tmp_msg, ASL_KEY_LEVEL, _asl_level_string(level)); + /* * Time, TimeNanoSec, Host, PID, UID, and GID values get set here */ @@ -2826,7 +2831,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%lu", tval.tv_sec); if (str != NULL) { - asl_set(msg, ASL_KEY_TIME, str); + asl_set(tmp_msg, ASL_KEY_TIME, str); free(str); str = NULL; } @@ -2834,7 +2839,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%lu", tval.tv_usec * 1000); if (str != NULL) { - asl_set(msg, ASL_KEY_TIME_NSEC, str); + asl_set(tmp_msg, ASL_KEY_TIME_NSEC, str); free(str); str = NULL; } @@ -2845,7 +2850,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%lu", tick); if (str != NULL) { - asl_set(msg, ASL_KEY_TIME, str); + asl_set(tmp_msg, ASL_KEY_TIME, str); free(str); str = NULL; } @@ -2854,14 +2859,14 @@ asl_send(aslclient ac, aslmsg msg) memset(&hname, 0, _POSIX_HOST_NAME_MAX); if (gethostname(hname, _POSIX_HOST_NAME_MAX) == 0) { - asl_set(msg, ASL_KEY_HOST, hname); + asl_set(tmp_msg, ASL_KEY_HOST, hname); } str = NULL; asprintf(&str, "%u", getpid()); if (str != NULL) { - asl_set(msg, ASL_KEY_PID, str); + asl_set(tmp_msg, ASL_KEY_PID, str); free(str); } @@ -2869,7 +2874,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%d", getuid()); if (str != NULL) { - asl_set(msg, ASL_KEY_UID, str); + asl_set(tmp_msg, ASL_KEY_UID, str); free(str); } @@ -2877,30 +2882,29 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%u", getgid()); if (str != NULL) { - asl_set(msg, ASL_KEY_GID, str); + asl_set(tmp_msg, ASL_KEY_GID, str); free(str); } senderx = (uint32_t)-1; facilityx = (uint32_t)-1; - mt = (asl_msg_t *)msg; - for (i = 0; (i < mt->count) && ((senderx == (uint32_t)-1) || (facilityx == (uint32_t)-1)); i++) + for (i = 0; (i < tmp_msg->count) && ((senderx == (uint32_t)-1) || (facilityx == (uint32_t)-1)); i++) { - if (mt->key[i] == NULL) continue; - if (streq(mt->key[i], ASL_KEY_SENDER)) senderx = i; - else if (streq(mt->key[i], ASL_KEY_FACILITY)) facilityx = i; + if (tmp_msg->key[i] == NULL) continue; + if (streq(tmp_msg->key[i], ASL_KEY_SENDER)) senderx = i; + else if (streq(tmp_msg->key[i], ASL_KEY_FACILITY)) facilityx = i; } /* * Set Sender if needed */ - if ((senderx == (uint32_t)-1) || (mt->val[senderx] == NULL)) + if ((senderx == (uint32_t)-1) || (tmp_msg->val[senderx] == NULL)) { if ((ac != NULL) && (ac->name != NULL)) { /* Use the Sender name from the client handle */ - asl_set(msg, ASL_KEY_SENDER, ac->name); + asl_set(tmp_msg, ASL_KEY_SENDER, ac->name); } else { @@ -2921,20 +2925,20 @@ asl_send(aslclient ac, aslmsg msg) } } - if (_asl_global.sender != NULL) asl_set(msg, ASL_KEY_SENDER, _asl_global.sender); - else asl_set(msg, ASL_KEY_SENDER, "Unknown"); + if (_asl_global.sender != NULL) asl_set(tmp_msg, ASL_KEY_SENDER, _asl_global.sender); + else asl_set(tmp_msg, ASL_KEY_SENDER, "Unknown"); } } /* * Set Facility */ - if ((facilityx == (uint32_t)-1) || (mt->val[facilityx] == NULL)) + if ((facilityx == (uint32_t)-1) || (tmp_msg->val[facilityx] == NULL)) { if ((ac != NULL) && (ac->facility != NULL)) { /* Use the Facility name from the client handle */ - asl_set(msg, ASL_KEY_FACILITY, ac->facility); + asl_set(tmp_msg, ASL_KEY_FACILITY, ac->facility); } } @@ -2944,7 +2948,7 @@ asl_send(aslclient ac, aslmsg msg) val = asl_get(msg, ASL_KEY_OPTION); if (val == NULL) { - asl_set(msg, ASL_KEY_OPTION, ASL_OPT_STORE); + asl_set(tmp_msg, ASL_KEY_OPTION, ASL_OPT_STORE); } else { @@ -2952,7 +2956,7 @@ asl_send(aslclient ac, aslmsg msg) 
asprintf(&str, "%s %s", ASL_OPT_STORE, val); if (str != NULL) { - asl_set(msg, ASL_KEY_OPTION, str); + asl_set(tmp_msg, ASL_KEY_OPTION, str); free(str); str = NULL; } @@ -2966,7 +2970,7 @@ asl_send(aslclient ac, aslmsg msg) if ((filter != 0) && ((filter & lmask) != 0)) { len = 0; - out_raw = asl_msg_to_string((asl_msg_t *)msg, &len); + out_raw = asl_msg_to_string(tmp_msg, &len); if ((out_raw != NULL) && (len != 0)) { @@ -3011,7 +3015,7 @@ asl_send(aslclient ac, aslmsg msg) if (asl->fd_list[i] < 0) continue; len = 0; - out = asl_format_message(msg, asl->fd_mfmt[i], asl->fd_tfmt[i], asl->fd_encoding[i], &len); + out = asl_format_message(tmp_msg, asl->fd_mfmt[i], asl->fd_tfmt[i], asl->fd_encoding[i], &len); if (out == NULL) continue; status = write(asl->fd_list[i], out, len - 1); @@ -3024,11 +3028,23 @@ asl_send(aslclient ac, aslmsg msg) free(out); } + asl_free((aslmsg)tmp_msg); + if (use_global_lock != 0) pthread_mutex_unlock(&_asl_global.lock); return outstatus; } +/* + * asl_send: send a message + * returns 0 for success, non-zero for failure + */ +int +asl_send(aslclient ac, aslmsg msg) +{ + return _asl_send_level_message(ac, msg, ASL_LEVEL_DEBUG, NULL); +} + char * asl_msg_string(aslmsg a) { diff --git a/gen/magazine_malloc.c b/gen/magazine_malloc.c index 402510c..a1fb6f0 100644 --- a/gen/magazine_malloc.c +++ b/gen/magazine_malloc.c @@ -1061,6 +1061,14 @@ mag_get_thread_index(szone_t *szone) return CPU_NUMBER() & (TINY_MAX_MAGAZINES - 1); } +#elif defined(__arm__) + +static INLINE mag_index_t +mag_get_thread_index(szone_t *szone) +{ + return 0; +} + #else #warning deriving magazine index from pthread_self() [want processor number] diff --git a/gen/stack_logging_disk.c b/gen/stack_logging_disk.c index 83c882f..aa0bf28 100644 --- a/gen/stack_logging_disk.c +++ b/gen/stack_logging_disk.c @@ -285,10 +285,13 @@ __expand_uniquing_table(backtrace_uniquing_table *uniquing_table) static int __enter_frames_in_table(backtrace_uniquing_table *uniquing_table, uint64_t *foundIndex, mach_vm_address_t *frames, int32_t count) { + // The hash values need to be the same size as the addresses (because we use the value -1), for clarity, define a new type + typedef mach_vm_address_t hash_index_t; + mach_vm_address_t thisPC; - uint64_t hash, uParent = (uint64_t)(-1ll), modulus = (uniquing_table->numNodes-uniquing_table->untouchableNodes-1); + hash_index_t hash, uParent = (hash_index_t)(-1ll), modulus = (uniquing_table->numNodes-uniquing_table->untouchableNodes-1); int32_t collisions, lcopy = count, returnVal = 1; - uint64_t hash_multiplier = ((uniquing_table->numNodes - uniquing_table->untouchableNodes)/(uniquing_table->max_collide*2+1)); + hash_index_t hash_multiplier = ((uniquing_table->numNodes - uniquing_table->untouchableNodes)/(uniquing_table->max_collide*2+1)); mach_vm_address_t *node; while (--lcopy >= 0) { thisPC = frames[lcopy]; diff --git a/locale/xlocale.c b/locale/xlocale.c index 17a1267..a43e1cd 100644 --- a/locale/xlocale.c +++ b/locale/xlocale.c @@ -112,6 +112,9 @@ _duplocale(locale_t loc) loc = &__global_locale; else if (loc == &__c_locale) { *new = __c_locale; + new->__refcount = 1; + new->__free_extra = (__free_extra_t)_releaselocale; + new->__lock = LOCK_INITIALIZER; return new; } XL_LOCK(loc); @@ -446,10 +449,13 @@ uselocale(locale_t loc) errno = EINVAL; return NULL; } - if (loc == &__global_locale) /* should never happen */ - loc = LC_GLOBAL_LOCALE; + if (loc == LC_GLOBAL_LOCALE || + loc == &__global_locale) /* should never happen */ + loc = NULL; + XL_RETAIN(loc); orig = 
pthread_getspecific(__locale_key); - pthread_setspecific(__locale_key, loc == LC_GLOBAL_LOCALE ? NULL : loc); + pthread_setspecific(__locale_key, loc); + XL_RELEASE(orig); } return (orig ? orig : LC_GLOBAL_LOCALE); } diff --git a/pthreads/pthread.c b/pthreads/pthread.c index 0936752..5e9aefb 100644 --- a/pthreads/pthread.c +++ b/pthreads/pthread.c @@ -222,7 +222,7 @@ _________________________________________ __private_extern__ void _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * funarg, size_t stacksize, unsigned int flags); -__private_extern__ +__private_extern__ void _pthread_wqthread(pthread_t self, mach_port_t kport, void * stackaddr, pthread_workitem_t item, int reuse); #define PTHREAD_START_CUSTOM 0x01000000 @@ -836,9 +836,9 @@ _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * f if ((pflags & PTHREAD_START_CUSTOM) == 0) { stackaddr = (char *)self; _pthread_struct_init(self, attrs, stackaddr, stacksize, 1, 1); - #if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) _pthread_set_self(self); - #endif +#endif LOCK(_pthread_list_lock); if (pflags & PTHREAD_START_SETSCHED) { self->policy = ((pflags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK); @@ -850,9 +850,9 @@ _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * f self->detached |= PTHREAD_CREATE_DETACHED; } } else { - #if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) _pthread_set_self(self); - #endif +#endif LOCK(_pthread_list_lock); } self->kernel_thread = kport; @@ -2090,9 +2090,6 @@ pthread_init(void) __oldstyle = 1; } #endif -#if defined(__arm__) - __oldstyle = 1; -#endif #if defined(_OBJC_PAGE_BASE_ADDRESS) { @@ -2110,7 +2107,7 @@ pthread_init(void) mig_init(1); /* enable multi-threaded mig interfaces */ if (__oldstyle == 0) { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) __bsdthread_register(thread_start, start_wqthread, round_page(sizeof(struct _pthread)), _pthread_start, &workq_targetconc[0], (__uint64_t)(&thread->tsd[__PTK_LIBDISPATCH_KEY0]) - (__uint64_t)thread); #else __bsdthread_register(_pthread_start, _pthread_wqthread, round_page(sizeof(struct _pthread)), NULL, &workq_targetconc[0], (__uint64_t)&thread->tsd[__PTK_LIBDISPATCH_KEY0] - (__uint64_t)thread); @@ -2493,7 +2490,7 @@ pthread_workqueue_atfork_parent(void) void pthread_workqueue_atfork_child(void) { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) /* * NOTE: workq additions here * are for i386,x86_64 only as @@ -2517,7 +2514,7 @@ _pthread_work_internal_init(void) pthread_workqueue_t wq; if (kernel_workq_setup == 0) { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) __bsdthread_register(thread_start, start_wqthread, round_page(sizeof(struct _pthread)),NULL,NULL, NULL); #else __bsdthread_register(_pthread_start, _pthread_wqthread, round_page(sizeof(struct _pthread)),NULL,NULL, NULL); @@ -2913,7 +2910,7 @@ _pthread_wqthread(pthread_t self, mach_port_t kport, void * stackaddr, pthread_w /* These are not joinable threads */ self->detached &= ~PTHREAD_CREATE_JOINABLE; self->detached |= PTHREAD_CREATE_DETACHED; -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) _pthread_set_self(self); #endif #if WQ_TRACE 
@@ -3094,10 +3091,6 @@ pthread_workqueue_create_np(pthread_workqueue_t * workqp, const pthread_workqueu pthread_workqueue_t wq; pthread_workqueue_head_t headp; -#if defined(__arm__) - /* not supported under arm */ - return(ENOTSUP); -#endif #if defined(__ppc__) IF_ROSETTA() { return(ENOTSUP); diff --git a/pthreads/pthread_machdep.h b/pthreads/pthread_machdep.h index 6ec9899..819df97 100644 --- a/pthreads/pthread_machdep.h +++ b/pthreads/pthread_machdep.h @@ -227,10 +227,12 @@ _pthread_getspecific_direct(unsigned long slot) #elif defined(__ppc64__) register void **__pthread_tsd asm ("r13"); ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))]; +#elif defined(__arm__) && defined(_ARM_ARCH_6) && !defined(_ARM_ARCH_7) && defined(__thumb__) && !defined(__OPTIMIZE__) + ret = pthread_getspecific(slot); #elif defined(__arm__) && defined(_ARM_ARCH_6) - void **__pthread_tsd; - __asm__ ("mrc p15, 0, %0, c13, c0, 3" : "=r"(__pthread_tsd)); - ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))]; + void **__pthread_tsd; + __asm__ ("mrc p15, 0, %0, c13, c0, 3" : "=r"(__pthread_tsd)); + ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))]; #elif defined(__arm__) && !defined(_ARM_ARCH_6) register void **__pthread_tsd asm ("r9"); ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))];
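A closing note on the pthread_machdep.h hunk above: the ARMv6 fast path reads the per-thread TSD base from the user read-only thread ID register (TPIDRURO, CP15 c13/c0/3), and mrc has no Thumb-1 encoding, which is presumably why the unoptimized __thumb__ build now falls back to plain pthread_getspecific(). A C sketch under those assumptions (tsd_direct_sketch is a hypothetical name; the _PTHREAD_TSD_OFFSET adjustment is omitted):

#include <pthread.h>

/* Sketch only, not Apple's implementation. */
static void *tsd_direct_sketch(unsigned long slot)
{
#if defined(__arm__) && defined(_ARM_ARCH_6) && !defined(__thumb__)
    void **tsd;
    __asm__ ("mrc p15, 0, %0, c13, c0, 3" : "=r"(tsd)); /* read TPIDRURO */
    return tsd[slot];  /* real code also adds _PTHREAD_TSD_OFFSET / sizeof(void *) */
#else
    return pthread_getspecific((pthread_key_t)slot);    /* portable fallback */
#endif
}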