From: Apple
Date: Fri, 22 Jan 2010 22:55:51 +0000 (+0000)
Subject: Libc-594.1.4.tar.gz
X-Git-Tag: mac-os-x-1063^0
X-Git-Url: https://git.saurik.com/apple/libc.git/commitdiff_plain/51282358e8fdbfc483c0c34e7eae9b89b51f2570

Libc-594.1.4.tar.gz
---

diff --git a/arm/pthreads/Makefile.inc b/arm/pthreads/Makefile.inc
index cc92564..4addcfe 100644
--- a/arm/pthreads/Makefile.inc
+++ b/arm/pthreads/Makefile.inc
@@ -4,4 +4,7 @@ MDSRCS += \
 	pthread_set_self.s \
 	pthread_self.s \
 	pthread_getspecific.s \
-	init_cpu_capabilities.c
+	init_cpu_capabilities.c \
+	start_wqthread.s \
+	thread_start.s
+
diff --git a/arm/pthreads/start_wqthread.s b/arm/pthreads/start_wqthread.s
new file mode 100644
index 0000000..3cf471e
--- /dev/null
+++ b/arm/pthreads/start_wqthread.s
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include
+
+#define __APPLE_API_PRIVATE
+#include
+#undef __APPLE_API_PRIVATE
+
+// This routine is never called directly by user code; it is entered from
+// the kernel. Arguments 0 through 3 arrive in registers r0 through r3; the
+// two extra arguments must be pushed onto the stack before calling
+// _pthread_wqthread():
+// arg4 is in r[4]
+// arg5 is in r[5]
+
+	.text
+	.align 2
+	.globl _start_wqthread
+_start_wqthread:
+	stmfd sp!, {r4, r5}
+	bl __pthread_wqthread
diff --git a/arm/pthreads/thread_start.s b/arm/pthreads/thread_start.s
new file mode 100644
index 0000000..e7574d6
--- /dev/null
+++ b/arm/pthreads/thread_start.s
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include
+
+#define __APPLE_API_PRIVATE
+#include
+#undef __APPLE_API_PRIVATE
+
+// This routine is never called directly by user code; it is entered from
+// the kernel. Arguments 0 through 3 arrive in registers r0 through r3; the
+// two extra arguments must be pushed onto the stack before calling
+// _pthread_start():
+// arg4 is in r[4]
+// arg5 is in r[5]
+
+	.text
+	.align 2
+	.globl _thread_start
+_thread_start:
+	stmfd sp!, {r4, r5}
+	bl __pthread_start
diff --git a/arm/string/Makefile.inc b/arm/string/Makefile.inc
index c89ffa2..73dcb7f 100644
--- a/arm/string/Makefile.inc
+++ b/arm/string/Makefile.inc
@@ -4,10 +4,11 @@
 #
 .PATH: ${.CURDIR}/arm/string
 
-MDSRCS += \
-	bcopy.s \
-	bzero.s \
-	ffs.s \
+MDSRCS += \
+	bcopy.s \
+	bzero.s \
+	ffs.s \
+	memcmp.s \
 	strcmp.s \
 	strlen.s
 
@@ -15,4 +16,4 @@ MDSRCS += \
 MDSRCS += memset_pattern.s
 .endif
 
-SUPPRESSSRCS += memcpy.c memmove.c memset.c strlen.c
+SUPPRESSSRCS += bcmp.c memcpy.c memmove.c memset.c strlen.c
diff --git a/arm/string/NEON/bcopy.s b/arm/string/NEON/bcopy.s
new file mode 100644
index 0000000..30abab1
--- /dev/null
+++ b/arm/string/NEON/bcopy.s
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*****************************************************************************
+ *  Cortex-A8 implementation                                                 *
+ *****************************************************************************/
+
+// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
+//
+// Our tests have shown that NEON is always a performance win for memcpy( ).
+// However, for the specific case of copies from a warm source to a cold
+// destination when the buffer size is between 1k and 32k, it is not enough
+// of a performance win to offset the increased power footprint, resulting
+// in an energy usage regression. Thus, we detect that particular case, and
+// pass those copies through the ARM core registers. All other copies larger
+// than 8 bytes are handled on NEON.
+//
+// Stephen Canon, August 2009
+
+.text
+.code 16
+.syntax unified
+
+// void bcopy(const void * source,
+//            void * destination,
+//            size_t length);
+//
+// void *memmove(void * destination,
+//               const void * source,
+//               size_t n);
+//
+// void *memcpy(void * restrict destination,
+//              const void * restrict source,
+//              size_t n);
+//
+// All three copy n successive bytes from source to destination. memmove and
+// memcpy return destination, whereas bcopy has no return value. Copying takes
+// place as if it were through a temporary buffer -- after return, destination
+// contains exactly the bytes from source, even if the buffers overlap.
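A minimal C model of that contract and of the direction test the code below performs with a single unsigned compare (an editor's sketch, not part of this commit; memmove_sketch is a hypothetical name): copying forward is unsafe only when 0 < destination - source < n, and because the difference wraps for destination < source, one unsigned comparison against n covers every case.

#include <stddef.h>

/* Sketch only: models "subs r3, r0, r1 / cmp r3, r2 / bhs". */
void *memmove_sketch(void *dst, const void *src, size_t n)
{
    char *d = dst;
    const char *s = src;
    size_t offset = (size_t)(d - s);   /* wraps when d < s */

    if (offset == 0)
        return dst;                    /* source == destination: nothing to do */
    if (offset >= n) {                 /* no harmful overlap: copy front-to-back */
        for (size_t i = 0; i < n; i++)
            d[i] = s[i];
    } else {                           /* destination overlaps ahead: back-to-front */
        while (n--)
            d[n] = s[n];
    }
    return dst;
}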
+ +.thumb_func _bcopy +.globl _bcopy +.thumb_func _memmove +.globl _memmove +.thumb_func _memcpy +.globl _memcpy + +.align 2 +_bcopy: + mov r3, r0 // swap the first and second arguments + mov r0, r1 // and fall through into memmove + mov r1, r3 // + +.align 2 +_memmove: +_memcpy: + subs r3, r0, r1 // offset = destination addr - source addr + it eq + bxeq lr // if source == destination, early out + +// Our preference is for using a (faster) front-to-back copy. However, if +// 0 < offset < length, it is necessary to copy back-to-front for correctness. +// We have already ruled out offset == 0, so we can use an unsigned compare +// with length -- if offset is higher, offset is either greater than length +// or negative. + + cmp r3, r2 + bhs L_copyFrontToBack + +/***************************************************************************** + * back to front copy * + *****************************************************************************/ + + mov ip, r0 // copy destination pointer. + add r1, r2 // move source pointer to end of source array + add ip, r2 // move destination pointer to end of dest array + + subs r2, $8 // if length - 8 is negative (i.e. length + blt L_scalarReverseCopy // is less than 8), jump to cleanup path. + tst ip, $7 // if (destination + length) is doubleword + beq L_vectorReverseCopy // aligned, jump to fast path. + +0: ldrb r3, [r1, $-1]! // load byte + sub r2, $1 // decrement length + strb r3, [ip, $-1]! // store byte + tst ip, $7 // test alignment + bne 0b + + cmp r2, $0 // if length - 8 is negative, + blt L_scalarReverseCopy // jump to the cleanup code + +/***************************************************************************** + * destination is doubleword aligned * + *****************************************************************************/ + +L_vectorReverseCopy: + ands r3, r1, $3 // Extract the alignment of the source + bic r1, $3 + tbh [pc, r3, lsl $1] // Dispatch table on source alignment +0: +.short (L_reverseAligned0-0b)/2 // The NEON alignment hardware does not work +.short (L_reverseAligned1-0b)/2 // properly with sub 4-byte alignment and +.short (L_reverseAligned2-0b)/2 // buffers that are uncacheable, so we need +.short (L_reverseAligned3-0b)/2 // to have a software workaround. + +/***************************************************************************** + * source is also at least word aligned * + *****************************************************************************/ + +L_reverseAligned0: + subs r2, $0x38 // if length - 64 is negative, jump to + blt L_reverseVectorCleanup// the cleanup path. + tst ip, $0x38 // if (destination + length) is cacheline + beq L_reverseCachelineAligned // aligned, jump to the fast path. + +0: sub r1, $8 // copy eight bytes at a time until the + vld1.32 {d0}, [r1] // destination is 8 byte aligned. + sub ip, $8 // + sub r2, $8 // + tst ip, $0x38 // + vst1.64 {d0}, [ip, :64] // + bne 0b // + + cmp r2, $0 // if length - 64 is negative, + blt L_reverseVectorCleanup// jump to the cleanup code + +L_reverseCachelineAligned: + sub r3, r2, $0x3c0 // If 1024 < length < 32768, use core + cmp r3, $0x7c00 // register copies instead of NEON to + blo L_useSTMDB // control energy usage. 
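The three instructions above fold the whole 1k-to-32k test into one unsigned compare. A C model of the trick (editor's sketch, not part of this commit; in_stm_window is a hypothetical name), noting that r2 holds length - 64 at this point:

#include <stddef.h>

/* Sketch only: models "sub r3, r2, $0x3c0 / cmp r3, $0x7c00 / blo".
 * The subtraction wraps for small values, so a single unsigned compare
 * tests 1024 <= length < 32768. */
static int in_stm_window(size_t length_minus_64)
{
    return (size_t)(length_minus_64 - 0x3c0) < 0x7c00;
}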
+
+	sub r1, $32			// decrement source
+	sub ip, $32			// decrement destination
+	mov r3, $-32			// load address increment
+	tst r1, $0x1f			// if source shares 32 byte alignment
+	beq L_reverseSourceAligned	// jump to loop with more alignment hints
+
+	vld1.32 {q2,q3}, [r1], r3	// This loop handles 4-byte aligned copies
+	vld1.32 {q0,q1}, [r1], r3	// as generally as possible.
+	subs r2, $64			//
+	vst1.64 {q2,q3}, [ip,:256], r3	// The Cortex-A8 NEON unit does not always
+	blt 1f				// properly handle misalignment in vld1
+.align 3				// with an element size of 8 or 16, so
+0:	vld1.32 {q2,q3}, [r1], r3	// this is the best we can do without
+	vst1.64 {q0,q1}, [ip,:256], r3	// handling alignment in software.
+	vld1.32 {q0,q1}, [r1], r3	//
+	subs r2, $64			//
+	vst1.64 {q2,q3}, [ip,:256], r3	//
+	bge 0b				//
+	b 1f				//
+
+L_reverseSourceAligned:
+	vld1.64 {q2,q3}, [r1,:256], r3	// Identical to loop above except for
+	vld1.64 {q0,q1}, [r1,:256], r3	// additional alignment information; this
+	subs r2, $64			// gets an additional .5 bytes per cycle
+	vst1.64 {q2,q3}, [ip,:256], r3	// on Cortex-A8.
+	blt 1f				//
+.align 3				//
+0:	vld1.64 {q2,q3}, [r1,:256], r3	//
+	vst1.64 {q0,q1}, [ip,:256], r3	//
+	vld1.64 {q0,q1}, [r1,:256], r3	//
+	subs r2, $64			//
+	vst1.64 {q2,q3}, [ip,:256], r3	//
+	bge 0b				//
+1:	vst1.64 {q0,q1}, [ip,:256], r3	// loop cleanup: final 32 byte store
+	add r1, $32			// point source at last element stored
+	add ip, $32			// point destination at last element stored
+
+L_reverseVectorCleanup:
+	adds r2, $0x38			// If (length - 8) < 0, goto scalar cleanup
+	blt L_scalarReverseCopy		//
+
+0:	sub r1, $8			// copy eight bytes at a time until
+	vld1.32 {d0}, [r1]		// (length - 8) < 0.
+	sub ip, $8			//
+	subs r2, $8			//
+	vst1.64 {d0}, [ip, :64]		//
+	bge 0b				//
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarReverseCopy:
+	adds r2, #0x8			// restore length
+	it eq				// if this is zero
+	bxeq lr				// early out
+
+0:	ldrb r3, [r1, #-1]!		// load a byte from source
+	strb r3, [ip, #-1]!		// store to destination
+	subs r2, #0x1			// subtract one from length
+	bne 0b				// if non-zero, repeat
+	bx lr				// return
+
+/*****************************************************************************
+ *  STMDB loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+L_useSTMDB:
+	push {r4-r8,r10,r11}
+.align 3
+0:	ldmdb r1!, {r3-r8,r10,r11}
+	subs r2, #0x40
+	stmdb ip!, {r3-r8,r10,r11}
+	ldmdb r1!, {r3-r8,r10,r11}
+	pld [r1, #-0x40]
+	stmdb ip!, {r3-r8,r10,r11}
+	bge 0b
+	pop {r4-r8,r10,r11}
+	b L_reverseVectorCleanup
+
+/*****************************************************************************
+ *  Misaligned reverse vld1 loop                                             *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
+// use vext.8 to extract a double word to store, and perform an 8-byte aligned
+// store to destination.
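Before the macro itself, a C model of the trick just described (editor's sketch, not part of this commit; extract_misaligned is a hypothetical helper): on little-endian ARM, vext.8 d0, d2, d3, #offset yields the doubleword that starts offset bytes into the pair {d2,d3}, so two aligned loads supply one misaligned value.

#include <stdint.h>

/* Sketch only: offset is the source misalignment mod 4, always 1, 2 or 3
 * here (offset 0 takes the aligned path), so the shifts stay in range. */
static uint64_t extract_misaligned(uint64_t lo, uint64_t hi, unsigned offset)
{
    return (lo >> (8 * offset)) | (hi << (64 - 8 * offset));
}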
+ +#define RCOPY_UNALIGNED(offset) \ + subs r2, $8 ;\ + blt 2f ;\ + sub r1, $8 ;\ + sub ip, $8 ;\ + mov r3, $-8 ;\ + vld1.32 {d2,d3}, [r1], r3 ;\ + subs r2, $8 ;\ + blt 1f ;\ +0: vext.8 d0, d2, d3, $(offset);\ + vmov d3, d2 ;\ + vld1.32 {d2}, [r1], r3 ;\ + subs r2, $8 ;\ + vst1.64 {d0}, [ip, :64], r3 ;\ + bge 0b ;\ +1: vext.8 d0, d2, d3, $(offset);\ + add r1, $8 ;\ + vst1.64 {d0}, [ip, :64] ;\ +2: add r2, $8 ;\ + add r1, $(offset);\ + b L_scalarReverseCopy + +L_reverseAligned1: + RCOPY_UNALIGNED(1) +L_reverseAligned2: + RCOPY_UNALIGNED(2) +L_reverseAligned3: + RCOPY_UNALIGNED(3) + +/***************************************************************************** + * front to back copy * + *****************************************************************************/ + +L_copyFrontToBack: + mov ip, r0 // copy destination pointer. + subs r2, $8 // if length - 8 is negative (i.e. length + blt L_scalarCopy // is less than 8), jump to cleanup path. + tst ip, $7 // if the destination is doubleword + beq L_vectorCopy // aligned, jump to fast path. + +0: ldrb r3, [r1], $1 // load byte + sub r2, $1 // decrement length + strb r3, [ip], $1 // store byte + tst ip, $7 // test alignment + bne 0b + + cmp r2, $0 // if length - 8 is negative, + blt L_scalarCopy // jump to the cleanup code + +/***************************************************************************** + * destination is doubleword aligned * + *****************************************************************************/ + +L_vectorCopy: + ands r3, r1, $3 // Extract the alignment of the source + bic r1, $3 + tbh [pc, r3, lsl $1] // Dispatch table on source alignment +0: +.short (L_sourceAligned0-0b)/2 // The NEON alignment hardware does not work +.short (L_sourceAligned1-0b)/2 // properly with sub 4-byte alignment and +.short (L_sourceAligned2-0b)/2 // buffers that are uncacheable, so we need +.short (L_sourceAligned3-0b)/2 // to have a software workaround. + +/***************************************************************************** + * source is also at least word aligned * + *****************************************************************************/ + +L_sourceAligned0: + subs r2, $0x38 // If (length - 64) < 0 + blt L_vectorCleanup // jump to cleanup code + tst ip, $0x38 // If destination is 64 byte aligned + beq L_cachelineAligned // jump to main loop + +0: vld1.32 {d0}, [r1]! // Copy one double word at a time until + sub r2, $8 // the destination is 64-byte aligned. + vst1.64 {d0}, [ip, :64]! // + tst ip, $0x38 // + bne 0b // + + cmp r2, $0 // If (length - 64) < 0, goto cleanup + blt L_vectorCleanup // + +L_cachelineAligned: + sub r3, r2, $0x3c0 // If 1024 < length < 32768, use core + cmp r3, $0x7c00 // register copies instead of NEON to + blo L_useSTMIA // control energy usage. + tst r1, $0x1f // If source has 32-byte alignment, use + beq L_sourceAligned32 // an optimized loop. + + vld1.32 {q2,q3}, [r1]! // This is the most common path for small + vld1.32 {q0,q1}, [r1]! // copies, which are alarmingly frequent. + subs r2, #0x40 // It requires 4-byte alignment on the + vst1.64 {q2,q3}, [ip, :256]! // source. For ordinary malloc'd buffers, + blt 1f // this path could handle only single-byte +.align 3 // alignment at speed by using vld1.8 +0: vld1.32 {q2,q3}, [r1]! // instead of vld1.32; however, the NEON + vst1.64 {q0,q1}, [ip, :256]! // alignment handler misbehaves for some + vld1.32 {q0,q1}, [r1]! // special copies if the element size is + subs r2, #0x40 // 8 or 16, so we need to work around + vst1.64 {q2,q3}, [ip, :256]! 
// sub 4-byte alignment in software, in
+	bge 0b				// another code path.
+	b 1f
+
+L_sourceAligned32:
+	vld1.64 {q2,q3}, [r1, :256]!	// When the source shares 32-byte alignment
+	vld1.64 {q0,q1}, [r1, :256]!	// with the destination, we use this loop
+	subs r2, #0x40			// instead, which specifies the maximum
+	vst1.64 {q2,q3}, [ip, :256]!	// :256 alignment on all loads and stores.
+	blt 1f				//
+.align 3				// This gets an additional .5 bytes per
+0:	vld1.64 {q2,q3}, [r1, :256]!	// cycle for in-cache copies, which is not
+	vst1.64 {q0,q1}, [ip, :256]!	// insignificant for this (rather common)
+	vld1.64 {q0,q1}, [r1, :256]!	// case.
+	subs r2, #0x40			//
+	vst1.64 {q2,q3}, [ip, :256]!	// This is identical to the above loop,
+	bge 0b				// except for the additional alignment.
+1:	vst1.64 {q0,q1}, [ip, :256]!	//
+
+L_vectorCleanup:
+	adds r2, $0x38			// If (length - 8) < 0, goto scalar cleanup
+	blt L_scalarCopy		//
+
+0:	vld1.32 {d0}, [r1]!		// Copy one doubleword at a time until
+	subs r2, $8			// (length - 8) < 0.
+	vst1.64 {d0}, [ip, :64]!	//
+	bge 0b				//
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarCopy:
+	adds r2, #0x8			// restore length
+	it eq				// if this is zero
+	bxeq lr				// early out
+
+0:	ldrb r3, [r1], #1		// load a byte from source
+	strb r3, [ip], #1		// store to destination
+	subs r2, #1			// subtract one from length
+	bne 0b				// if non-zero, repeat
+	bx lr				// return
+
+/*****************************************************************************
+ *  STMIA loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+L_useSTMIA:
+	push {r4-r8,r10,r11}
+.align 3
+0:	ldmia r1!, {r3-r8,r10,r11}
+	subs r2, r2, #64
+	stmia ip!, {r3-r8,r10,r11}
+	ldmia r1!, {r3-r8,r10,r11}
+	pld [r1, #64]
+	stmia ip!, {r3-r8,r10,r11}
+	bge 0b
+	pop {r4-r8,r10,r11}
+	b L_vectorCleanup
+
+/*****************************************************************************
+ *  Misaligned vld1 loop                                                     *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
+// use vext.8 to extract a double word to store, and perform an 8-byte aligned
+// store to destination.
+
+#define COPY_UNALIGNED(offset) \
+	subs r2, $8 ;\
+	blt 2f ;\
+	vld1.32 {d2,d3}, [r1]! ;\
+	subs r2, $8 ;\
+	blt 1f ;\
+0:	vext.8 d0, d2, d3, $(offset);\
+	vmov d2, d3 ;\
+	vld1.32 {d3}, [r1]! ;\
+	subs r2, $8 ;\
+	vst1.64 {d0}, [ip, :64]! ;\
+	bge 0b ;\
+1:	vext.8 d0, d2, d3, $(offset);\
+	sub r1, $8 ;\
+	vst1.64 {d0}, [ip, :64]! ;\
+2:	add r1, $(offset);\
+	add r2, $8 ;\
+	b L_scalarCopy
+
+L_sourceAligned1:
+	COPY_UNALIGNED(1)
+L_sourceAligned2:
+	COPY_UNALIGNED(2)
+L_sourceAligned3:
+	COPY_UNALIGNED(3)
diff --git a/arm/string/NEON/bzero.s b/arm/string/NEON/bzero.s
new file mode 100644
index 0000000..50b1c8e
--- /dev/null
+++ b/arm/string/NEON/bzero.s
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License.
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/********************************************************************** + * Cortex-A8 implementation * + **********************************************************************/ + +// Cortex-A8 implementations of memset( ) and bzero( ). Main loop is 64-byte +// NEON stores, unless the buffer length is > 1k. Beyond that point, there is +// little to no speed advantage with NEON (and a slight regression in some +// measured cases), so we switch to the GPRs. +// +// The crossover point should be reevaluated for future architectures. +// +// -- Stephen Canon, August 2009 + +.text +.syntax unified +.code 16 + +// void bzero(void * destination, +// size_t length); +// +// zeros out a buffer length bytes long, beginning at the address destination. +.thumb_func ___bzero +.globl ___bzero +.thumb_func _bzero +.globl _bzero +.align 2 +___bzero: +_bzero: + mov r2, r1 // match the API to memset(dest, 0, length) + eor r1, r1 // and fall through into memset + +// void *memset(void * destination, +// int value, size_t n); +// +// writes value converted to an unsigned char to n successive bytes, beginning +// at destination. + +// Notes on register usage: +// +// Throughout this function, registers have nearly constant usage; the pattern +// is: +// +// r0 holds the original destination pointer, unmodified. This value +// must be returned by the routine, so it is easiest to just leave it +// in place. +// r1 holds the value that is being copied into the buffer, in some stage +// of splattedness. The low byte is guaranteed to always have the value +// but the higher bytes may or may not contain copies of it. +// r2 holds the length minus some offset, where the offset is always the +// number of bytes that the current loop stores per iteration. +// r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted +// copies of the value to be stored. +// ip holds a pointer to the lowest byte in the array that has not yet been +// set to hold value. +// q0 and q1 hold splatted copies of the value in the vector path, and are +// otherwise unused. + +.thumb_func _memset +.globl _memset +.align 2 +_memset: + mov ip, r0 // copy destination pointer. + subs r2, #0x8 // if length - 8 is negative (i.e. length + and r1, #0xff // is less than 8), jump to cleanup path. + blt L_scalarCleanup // + + tst ip, #0x7 // if the destination is doubleword + beq L_vectorCopy // aligned, jump to fast path. + +0: strb r1, [ip], #1 // store one byte at a time until + sub r2, #1 // destination pointer is 8 byte aligned. + tst ip, #7 // + bne 0b // + + cmp r2, #0x0 // if length - 8 is negative, + blt L_scalarCleanup // jump to the cleanup code + +L_vectorCopy: + vdup.8 q0, r1 // splat the byte to be stored across + subs r2, #0x38 // q0 and q1, and check if length - 64 + vmov q1, q0 // is negative; if so, jump to the + blt L_vectorCleanup // cleanup code. 
+ + tst ip, #0x38 // if the destination is cacheline + beq L_cachelineAligned // aligned, jump to the fast path. + +0: vst1.64 {d0}, [ip, :64]! // store one double word at a time until + sub r2, #8 // the destination is 64-byte aligned + tst ip, #0x38 // + bne 0b + + cmp r2, #0x0 // if length - 64 is negative, + blt L_vectorCleanup // jump to the cleanup code + +L_cachelineAligned: + cmp r2, #0x3c0 // if length > 1024 + bge L_useSTMIA // we use stmia instead + +.align 4 // main loop +0: vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes + subs r2, #0x40 // decrement length by 64 + vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes + bge 0b // if length - 64 >= 0, continue + +L_vectorCleanup: + adds r2, #0x38 // if (length - 8) < 0, goto scalar cleanup + blt L_scalarCleanup // + +0: subs r2, #8 // store one double word at a time until + vst1.64 {d0}, [ip, :64]! // (length - 8) < 0. + bge 0b + +L_scalarCleanup: + adds r2, #8 // restore length + beq 1f // early out if zero. + +0: strb r1, [ip], #1 // store one byte at a time until length + subs r2, #1 // is zero. + bne 0b // +1: bx lr // return. + +// STMIA loop for large buffers +// +// For stores larger than 1024 bytes, we use STMIA because we can't get enough +// of a speedup from NEON to offset the higher power draw of the NEON unit. +// +// This crossover should be reevaluated on future architectures. +// +// We avoid using r7 and r9 even though it's not strictly necessary. + +L_useSTMIA: + push {r4,r5,r6,r8,r10,r11} + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r8, r1 + mov r10, r1 + mov r11, r1 +.align 4 +0: stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} + subs r2, #0x40 + stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} + bge 0b + pop {r4,r5,r6,r8,r10,r11} + b L_vectorCleanup diff --git a/arm/string/bcopy.s b/arm/string/bcopy.s index da24152..2e67e1c 100644 --- a/arm/string/bcopy.s +++ b/arm/string/bcopy.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006, 2009 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -21,6 +21,19 @@ * @APPLE_LICENSE_HEADER_END@ */ +#if defined __thumb2__ && defined __ARM_NEON__ + +// Use our tuned NEON implementation when it is available. Otherwise fall back +// on more generic ARM code. + +#include "NEON/bcopy.s" + +#else // defined __thumb2__ && defined __ARM_NEON__ + +/***************************************************************************** + * ARMv5 and ARMv6 implementation * + *****************************************************************************/ + #include .text @@ -398,4 +411,5 @@ Lalign3_forward_loop: Lexit: ldmfd sp!, {r0, r4, r5, r7, pc} +#endif // defined __thumb2__ && defined __ARM_NEON__ diff --git a/arm/string/bzero.s b/arm/string/bzero.s index ada3727..e3a3a8d 100644 --- a/arm/string/bzero.s +++ b/arm/string/bzero.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006, 2009 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,6 +20,15 @@ * * @APPLE_LICENSE_HEADER_END@ */ + +#if defined __thumb2__ && defined __ARM_NEON__ + +// Use our tuned NEON implementation when it is available. Otherwise fall back +// on more generic ARM code. 
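An aside on the splat step shared by both memset( ) paths above (editor's sketch, not part of this commit; splat_byte is a hypothetical name): vdup.8 broadcasts the fill byte across a 128-bit NEON register, while the L_useSTMIA path builds the same 32-bit pattern in a core register with two orr instructions.

#include <stdint.h>

/* Sketch only: models "orr r1, r1, r1, lsl #8 / orr r1, r1, r1, lsl #16". */
static uint32_t splat_byte(uint8_t value)
{
    uint32_t v = value;  /* 0x000000VV */
    v |= v << 8;         /* 0x0000VVVV */
    v |= v << 16;        /* 0xVVVVVVVV */
    return v;
}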
+ +#include "NEON/bzero.s" + +#else // defined __thumb2__ && defined __ARM_NEON__ #include #include @@ -160,3 +169,5 @@ L_unaligned: b L_lessthan64aligned X_LEAF(___bzero, _bzero) + +#endif // defined __thumb2__ && defined __ARM_NEON__ diff --git a/arm/string/memcmp.s b/arm/string/memcmp.s new file mode 100644 index 0000000..83e0f87 --- /dev/null +++ b/arm/string/memcmp.s @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +// ARM Assembly implementation of memcmp( ) from +// Uses Thumb2 if it is available, otherwise generates ARM code. +// +// -- Stephen Canon, August 2009 +// +// The basic idea is to use word compares instead of byte compares as long as +// at least four bytes remain to be compared. However, because memcmp( ) +// compares the buffers as though they were big-endian unsigned integers, we +// need to byte-reverse each word before comparing them. +// +// If the buffers are not word aligned, or they are shorter than four bytes, +// we just use a simple byte comparison loop instead. +// +// int bcmp(void *src1, void *src2, size_t length); +// int memcmp(void *src1, void *src2, size_t length); + +#include + + .text + .syntax unified +#if defined __thumb2__ + .code 16 + .thumb_func _bcmp + .thumb_func _memcmp +#else + .code 32 +#endif + .globl _bcmp + .globl _memcmp + .align 3 +_bcmp: +_memcmp: + +#ifdef _ARM_ARCH_6 + subs ip, r2, #4 // if length < 4 + bmi L_useByteCompares // jump to the byte comparison loop + + orr r3, r0, r1 // if the buffers are + tst r3, #3 // not word aligned + bne L_useByteCompares // jump to the byte comparison loop + +.align 3 +L_wordCompare: // Here we know that both buffers are word + ldr r2, [r0], #4 // aligned, and (length - 4) > 0, so at least + ldr r3, [r1], #4 // four bytes remain to be compared. We load + subs ip, #4 // a word from each buffer, and byte reverse + bmi L_lastWord // the loaded words. We also decrement the + rev r2, r2 // length by four and jump out of this loop if + rev r3, r3 // the result is negative. Then we compare the + cmp r2, r3 // reversed words, and continue the loop only + beq L_wordCompare // if they are equal. +L_wordsUnequal: + ite hi // If the words compared unequal, return +/- 1 + movhi r0, #1 // according to the result of the comparison. + movls r0, #-1 // + bx lr // +L_lastWord: + rev r2, r2 // If we just loaded the last complete words + rev r3, r3 // from the buffers, byte-reverse them and + cmp r2, r3 // compare. If they are unequal, jump to the + bne L_wordsUnequal // return path. + add r2, ip, #4 // Otherwise, fall into the cleanup code. 
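An editor's sketch in C of the word-compare core above (not part of this commit; rev32 and memcmp_words are hypothetical names): memcmp ordering is that of big-endian unsigned integers, so each little-endian word load is byte-reversed with rev before a single unsigned compare decides the result.

#include <stdint.h>
#include <stddef.h>

static uint32_t rev32(uint32_t x)   /* models the ARM "rev" instruction */
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00U)
         | ((x << 8) & 0x00ff0000U) | (x << 24);
}

/* Sketch only: word-at-a-time core for word-aligned buffers, nwords > 0. */
int memcmp_words(const uint32_t *a, const uint32_t *b, size_t nwords)
{
    for (size_t i = 0; i < nwords; i++) {
        uint32_t x = rev32(a[i]), y = rev32(b[i]);
        if (x != y)
            return x > y ? 1 : -1;   /* the "ite hi / movhi / movls" path */
    }
    return 0;
}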
+#endif // _ARM_ARCH_6 + +L_useByteCompares: + tst r2, r2 // If the length is exactly zero + beq L_returnZero // avoid doing any loads and return zero. + mov r3, r0 +.align 3 +L_byteCompareLoop: + ldrb r0, [r3], #1 // Load a byte from each buffer, and decrement + ldrb ip, [r1], #1 // the length by one. If the decremented + subs r2, #1 // length is zero, exit the loop. Otherwise + beq L_lastByte // subtract the loaded bytes; if their + subs r0, ip // difference is zero, continue the comparison + beq L_byteCompareLoop // loop. Otherwise, return their difference. + bx lr +L_returnZero: + mov r0, ip +L_lastByte: + sub r0, ip // Return the difference of the final bytes + bx lr diff --git a/arm/sys/OSAtomic-v4.c b/arm/sys/OSAtomic-v4.c index c725cb4..723d84f 100644 --- a/arm/sys/OSAtomic-v4.c +++ b/arm/sys/OSAtomic-v4.c @@ -187,6 +187,30 @@ bool OSAtomicCompareAndSwap32Barrier( int32_t oldValue, int32_t newValue, vol return OSAtomicCompareAndSwap32(oldValue, newValue, theValue); } +bool +OSAtomicCompareAndSwapInt(int oldValue, int newValue, volatile int *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, theValue); +} + +bool +OSAtomicCompareAndSwapIntBarrier(int oldValue, int newValue, volatile int *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, theValue); +} + +bool +OSAtomicCompareAndSwapLong(long oldValue, long newValue, volatile long *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, (volatile int32_t *)theValue); +} + +bool +OSAtomicCompareAndSwapLongBarrier(long oldValue, long newValue, volatile long *theValue) +{ + return OSAtomicCompareAndSwap32(oldValue, newValue, (volatile int32_t *)theValue); +} + bool OSAtomicCompareAndSwap64( int64_t oldValue, int64_t newValue, volatile int64_t *theValue ) { bool result; diff --git a/gdtoa/FreeBSD/gdtoa-misc.c.patch b/gdtoa/FreeBSD/gdtoa-misc.c.patch index 8974654..261ec59 100644 --- a/gdtoa/FreeBSD/gdtoa-misc.c.patch +++ b/gdtoa/FreeBSD/gdtoa-misc.c.patch @@ -1,5 +1,5 @@ ---- gdtoa-misc.c.orig 2008-11-05 15:59:34.000000000 -0800 -+++ gdtoa-misc.c 2008-11-05 16:05:28.000000000 -0800 +--- gdtoa-misc.c.orig 2010-01-07 22:03:21.000000000 -0800 ++++ gdtoa-misc.c 2010-01-07 22:25:33.000000000 -0800 @@ -29,9 +29,20 @@ THIS SOFTWARE. /* Please send bug reports to David M. Gay (dmg at acm dot org, * with " at " changed at "@" and " dot " changed to "."). 
*/ @@ -48,7 +48,7 @@ Bigint * Balloc #ifdef KR_headers -@@ -53,8 +84,25 @@ Balloc +@@ -53,9 +84,26 @@ Balloc #ifndef Omit_Private_Memory unsigned int len; #endif @@ -70,10 +70,21 @@ + } +#else /* !GDTOA_TSD */ ACQUIRE_DTOA_LOCK(0); +- if ( (rv = freelist[k]) !=0) { +#endif /* GDTOA_TSD */ - if ( (rv = freelist[k]) !=0) { ++ if (k <= Kmax && (rv = freelist[k]) !=0) { freelist[k] = rv->next; } + else { +@@ -65,7 +113,7 @@ Balloc + #else + len = (sizeof(Bigint) + (x-1)*sizeof(ULong) + sizeof(double) - 1) + /sizeof(double); +- if (pmem_next - private_mem + len <= PRIVATE_mem) { ++ if (k <= Kmax && pmem_next - private_mem + len <= PRIVATE_mem) { + rv = (Bigint*)pmem_next; + pmem_next += len; + } @@ -75,7 +123,9 @@ Balloc rv->k = k; rv->maxwds = x; @@ -84,20 +95,28 @@ rv->sign = rv->wds = 0; return rv; } -@@ -89,10 +139,16 @@ Bfree +@@ -89,10 +139,20 @@ Bfree #endif { if (v) { +- ACQUIRE_DTOA_LOCK(0); +- v->next = freelist[v->k]; +- freelist[v->k] = v; +- FREE_DTOA_LOCK(0); ++ if (v->k > Kmax) ++ free((void*)v); ++ else { +#ifdef GDTOA_TSD -+ Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); ++ Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); +#else /* !GDTOA_TSD */ - ACQUIRE_DTOA_LOCK(0); ++ ACQUIRE_DTOA_LOCK(0); +#endif /* GDTOA_TSD */ - v->next = freelist[v->k]; - freelist[v->k] = v; ++ v->next = freelist[v->k]; ++ freelist[v->k] = v; +#ifndef GDTOA_TSD - FREE_DTOA_LOCK(0); ++ FREE_DTOA_LOCK(0); +#endif /* GDTOA_TSD */ ++ } } } diff --git a/gdtoa/gdtoa-misc-fbsd.c b/gdtoa/gdtoa-misc-fbsd.c index 8540a0c..659f69c 100644 --- a/gdtoa/gdtoa-misc-fbsd.c +++ b/gdtoa/gdtoa-misc-fbsd.c @@ -103,7 +103,7 @@ Balloc #else /* !GDTOA_TSD */ ACQUIRE_DTOA_LOCK(0); #endif /* GDTOA_TSD */ - if ( (rv = freelist[k]) !=0) { + if (k <= Kmax && (rv = freelist[k]) !=0) { freelist[k] = rv->next; } else { @@ -113,7 +113,7 @@ Balloc #else len = (sizeof(Bigint) + (x-1)*sizeof(ULong) + sizeof(double) - 1) /sizeof(double); - if (pmem_next - private_mem + len <= PRIVATE_mem) { + if (k <= Kmax && pmem_next - private_mem + len <= PRIVATE_mem) { rv = (Bigint*)pmem_next; pmem_next += len; } @@ -139,16 +139,20 @@ Bfree #endif { if (v) { + if (v->k > Kmax) + free((void*)v); + else { #ifdef GDTOA_TSD - Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); + Bigint **freelist = (Bigint **)pthread_getspecific(gdtoa_tsd_key); #else /* !GDTOA_TSD */ - ACQUIRE_DTOA_LOCK(0); + ACQUIRE_DTOA_LOCK(0); #endif /* GDTOA_TSD */ - v->next = freelist[v->k]; - freelist[v->k] = v; + v->next = freelist[v->k]; + freelist[v->k] = v; #ifndef GDTOA_TSD - FREE_DTOA_LOCK(0); + FREE_DTOA_LOCK(0); #endif /* GDTOA_TSD */ + } } } diff --git a/gen/asl.c b/gen/asl.c index 5a18acd..8f1f34d 100644 --- a/gen/asl.c +++ b/gen/asl.c @@ -84,6 +84,7 @@ time_t asl_parse_time(const char *); const char *asl_syslog_faciliy_num_to_name(int n); __private_extern__ asl_client_t *_asl_open_default(); +__private_extern__ int _asl_send_level_message(aslclient ac, aslmsg msg, int level, const char *message); /* notify SPI */ uint32_t notify_register_plain(const char *name, int *out_token); @@ -2312,17 +2313,6 @@ asl_vlog(aslclient ac, aslmsg a, int level, const char *format, va_list ap) if (level < ASL_LEVEL_EMERG) level = ASL_LEVEL_EMERG; if (level > ASL_LEVEL_DEBUG) level = ASL_LEVEL_DEBUG; - str = NULL; - asprintf(&str, "%d", level); - if (str == NULL) - { - if ((msg != NULL) && (my_msg != 0)) asl_free(msg); - return -1; - } - - asl_set(msg, ASL_KEY_LEVEL, str); - free(str); - /* insert strerror for %m */ len = 0; elen = 0; @@ 
-2409,11 +2399,9 @@ asl_vlog(aslclient ac, aslmsg a, int level, const char *format, va_list ap) return -1; } - asl_set(msg, ASL_KEY_MSG, str); + status = _asl_send_level_message(ac, (aslmsg)msg, level, str); free(str); - status = asl_send(ac, (aslmsg)msg); - if ((msg != NULL) && (my_msg != 0)) asl_free(msg); return status; } @@ -2725,18 +2713,17 @@ asl_format_message(aslmsg msg, const char *mfmt, const char *tfmt, uint32_t text } /* - * asl_send: send a message + * asl_send (internal version): send a message * This routine may be used instead of asl_log() or asl_vlog() if asl_set() * has been used to set all of a message's attributes. - * msg: an aslmsg * returns 0 for success, non-zero for failure */ -int -asl_send(aslclient ac, aslmsg msg) +__private_extern__ int +_asl_send_level_message(aslclient ac, aslmsg msg, int level, const char *message) { char *str, *out_raw; caddr_t out; - uint32_t i, len, outlen, level, lmask, outstatus, filter, check, senderx, facilityx; + uint32_t i, len, outlen, lmask, outstatus, filter, check, senderx, facilityx; uint64_t v64; const char *val; char *name, *x; @@ -2745,7 +2732,7 @@ asl_send(aslclient ac, aslmsg msg) int status, rc_filter; asl_client_t *asl; int use_global_lock; - asl_msg_t *mt; + asl_msg_t *mt, *tmp_msg; char hname[_POSIX_HOST_NAME_MAX]; kern_return_t kstatus; @@ -2760,8 +2747,6 @@ asl_send(aslclient ac, aslmsg msg) if (msg == NULL) return 0; - level = ASL_LEVEL_DEBUG; - val = asl_get(msg, ASL_KEY_LEVEL); if (val != NULL) level = atoi(val); @@ -2814,6 +2799,26 @@ asl_send(aslclient ac, aslmsg msg) rc_filter = 1; } + /* + * Copy the message to tmp_msg to make setting values thread-safe + */ + tmp_msg = calloc(1, sizeof(asl_msg_t)); + if (tmp_msg == NULL) return -1; + + tmp_msg->type = ASL_TYPE_MSG; + + mt = (asl_msg_t *)msg; + for (i = 0; i < mt->count; i++) + { + asl_set(tmp_msg, mt->key[i], mt->val[i]); + } + + /* + * Set Level and Message from parameters. 
+ */ + if (message != NULL) asl_set(tmp_msg, ASL_KEY_MSG, message); + asl_set(tmp_msg, ASL_KEY_LEVEL, _asl_level_string(level)); + /* * Time, TimeNanoSec, Host, PID, UID, and GID values get set here */ @@ -2826,7 +2831,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%lu", tval.tv_sec); if (str != NULL) { - asl_set(msg, ASL_KEY_TIME, str); + asl_set(tmp_msg, ASL_KEY_TIME, str); free(str); str = NULL; } @@ -2834,7 +2839,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%lu", tval.tv_usec * 1000); if (str != NULL) { - asl_set(msg, ASL_KEY_TIME_NSEC, str); + asl_set(tmp_msg, ASL_KEY_TIME_NSEC, str); free(str); str = NULL; } @@ -2845,7 +2850,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%lu", tick); if (str != NULL) { - asl_set(msg, ASL_KEY_TIME, str); + asl_set(tmp_msg, ASL_KEY_TIME, str); free(str); str = NULL; } @@ -2854,14 +2859,14 @@ asl_send(aslclient ac, aslmsg msg) memset(&hname, 0, _POSIX_HOST_NAME_MAX); if (gethostname(hname, _POSIX_HOST_NAME_MAX) == 0) { - asl_set(msg, ASL_KEY_HOST, hname); + asl_set(tmp_msg, ASL_KEY_HOST, hname); } str = NULL; asprintf(&str, "%u", getpid()); if (str != NULL) { - asl_set(msg, ASL_KEY_PID, str); + asl_set(tmp_msg, ASL_KEY_PID, str); free(str); } @@ -2869,7 +2874,7 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%d", getuid()); if (str != NULL) { - asl_set(msg, ASL_KEY_UID, str); + asl_set(tmp_msg, ASL_KEY_UID, str); free(str); } @@ -2877,30 +2882,29 @@ asl_send(aslclient ac, aslmsg msg) asprintf(&str, "%u", getgid()); if (str != NULL) { - asl_set(msg, ASL_KEY_GID, str); + asl_set(tmp_msg, ASL_KEY_GID, str); free(str); } senderx = (uint32_t)-1; facilityx = (uint32_t)-1; - mt = (asl_msg_t *)msg; - for (i = 0; (i < mt->count) && ((senderx == (uint32_t)-1) || (facilityx == (uint32_t)-1)); i++) + for (i = 0; (i < tmp_msg->count) && ((senderx == (uint32_t)-1) || (facilityx == (uint32_t)-1)); i++) { - if (mt->key[i] == NULL) continue; - if (streq(mt->key[i], ASL_KEY_SENDER)) senderx = i; - else if (streq(mt->key[i], ASL_KEY_FACILITY)) facilityx = i; + if (tmp_msg->key[i] == NULL) continue; + if (streq(tmp_msg->key[i], ASL_KEY_SENDER)) senderx = i; + else if (streq(tmp_msg->key[i], ASL_KEY_FACILITY)) facilityx = i; } /* * Set Sender if needed */ - if ((senderx == (uint32_t)-1) || (mt->val[senderx] == NULL)) + if ((senderx == (uint32_t)-1) || (tmp_msg->val[senderx] == NULL)) { if ((ac != NULL) && (ac->name != NULL)) { /* Use the Sender name from the client handle */ - asl_set(msg, ASL_KEY_SENDER, ac->name); + asl_set(tmp_msg, ASL_KEY_SENDER, ac->name); } else { @@ -2921,20 +2925,20 @@ asl_send(aslclient ac, aslmsg msg) } } - if (_asl_global.sender != NULL) asl_set(msg, ASL_KEY_SENDER, _asl_global.sender); - else asl_set(msg, ASL_KEY_SENDER, "Unknown"); + if (_asl_global.sender != NULL) asl_set(tmp_msg, ASL_KEY_SENDER, _asl_global.sender); + else asl_set(tmp_msg, ASL_KEY_SENDER, "Unknown"); } } /* * Set Facility */ - if ((facilityx == (uint32_t)-1) || (mt->val[facilityx] == NULL)) + if ((facilityx == (uint32_t)-1) || (tmp_msg->val[facilityx] == NULL)) { if ((ac != NULL) && (ac->facility != NULL)) { /* Use the Facility name from the client handle */ - asl_set(msg, ASL_KEY_FACILITY, ac->facility); + asl_set(tmp_msg, ASL_KEY_FACILITY, ac->facility); } } @@ -2944,7 +2948,7 @@ asl_send(aslclient ac, aslmsg msg) val = asl_get(msg, ASL_KEY_OPTION); if (val == NULL) { - asl_set(msg, ASL_KEY_OPTION, ASL_OPT_STORE); + asl_set(tmp_msg, ASL_KEY_OPTION, ASL_OPT_STORE); } else { @@ -2952,7 +2956,7 @@ asl_send(aslclient ac, aslmsg msg) 
asprintf(&str, "%s %s", ASL_OPT_STORE, val); if (str != NULL) { - asl_set(msg, ASL_KEY_OPTION, str); + asl_set(tmp_msg, ASL_KEY_OPTION, str); free(str); str = NULL; } @@ -2966,7 +2970,7 @@ asl_send(aslclient ac, aslmsg msg) if ((filter != 0) && ((filter & lmask) != 0)) { len = 0; - out_raw = asl_msg_to_string((asl_msg_t *)msg, &len); + out_raw = asl_msg_to_string(tmp_msg, &len); if ((out_raw != NULL) && (len != 0)) { @@ -3011,7 +3015,7 @@ asl_send(aslclient ac, aslmsg msg) if (asl->fd_list[i] < 0) continue; len = 0; - out = asl_format_message(msg, asl->fd_mfmt[i], asl->fd_tfmt[i], asl->fd_encoding[i], &len); + out = asl_format_message(tmp_msg, asl->fd_mfmt[i], asl->fd_tfmt[i], asl->fd_encoding[i], &len); if (out == NULL) continue; status = write(asl->fd_list[i], out, len - 1); @@ -3024,11 +3028,23 @@ asl_send(aslclient ac, aslmsg msg) free(out); } + asl_free((aslmsg)tmp_msg); + if (use_global_lock != 0) pthread_mutex_unlock(&_asl_global.lock); return outstatus; } +/* + * asl_send: send a message + * returns 0 for success, non-zero for failure + */ +int +asl_send(aslclient ac, aslmsg msg) +{ + return _asl_send_level_message(ac, msg, ASL_LEVEL_DEBUG, NULL); +} + char * asl_msg_string(aslmsg a) { diff --git a/gen/magazine_malloc.c b/gen/magazine_malloc.c index 402510c..a1fb6f0 100644 --- a/gen/magazine_malloc.c +++ b/gen/magazine_malloc.c @@ -1061,6 +1061,14 @@ mag_get_thread_index(szone_t *szone) return CPU_NUMBER() & (TINY_MAX_MAGAZINES - 1); } +#elif defined(__arm__) + +static INLINE mag_index_t +mag_get_thread_index(szone_t *szone) +{ + return 0; +} + #else #warning deriving magazine index from pthread_self() [want processor number] diff --git a/gen/stack_logging_disk.c b/gen/stack_logging_disk.c index 83c882f..aa0bf28 100644 --- a/gen/stack_logging_disk.c +++ b/gen/stack_logging_disk.c @@ -285,10 +285,13 @@ __expand_uniquing_table(backtrace_uniquing_table *uniquing_table) static int __enter_frames_in_table(backtrace_uniquing_table *uniquing_table, uint64_t *foundIndex, mach_vm_address_t *frames, int32_t count) { + // The hash values need to be the same size as the addresses (because we use the value -1), for clarity, define a new type + typedef mach_vm_address_t hash_index_t; + mach_vm_address_t thisPC; - uint64_t hash, uParent = (uint64_t)(-1ll), modulus = (uniquing_table->numNodes-uniquing_table->untouchableNodes-1); + hash_index_t hash, uParent = (hash_index_t)(-1ll), modulus = (uniquing_table->numNodes-uniquing_table->untouchableNodes-1); int32_t collisions, lcopy = count, returnVal = 1; - uint64_t hash_multiplier = ((uniquing_table->numNodes - uniquing_table->untouchableNodes)/(uniquing_table->max_collide*2+1)); + hash_index_t hash_multiplier = ((uniquing_table->numNodes - uniquing_table->untouchableNodes)/(uniquing_table->max_collide*2+1)); mach_vm_address_t *node; while (--lcopy >= 0) { thisPC = frames[lcopy]; diff --git a/locale/xlocale.c b/locale/xlocale.c index 17a1267..a43e1cd 100644 --- a/locale/xlocale.c +++ b/locale/xlocale.c @@ -112,6 +112,9 @@ _duplocale(locale_t loc) loc = &__global_locale; else if (loc == &__c_locale) { *new = __c_locale; + new->__refcount = 1; + new->__free_extra = (__free_extra_t)_releaselocale; + new->__lock = LOCK_INITIALIZER; return new; } XL_LOCK(loc); @@ -446,10 +449,13 @@ uselocale(locale_t loc) errno = EINVAL; return NULL; } - if (loc == &__global_locale) /* should never happen */ - loc = LC_GLOBAL_LOCALE; + if (loc == LC_GLOBAL_LOCALE || + loc == &__global_locale) /* should never happen */ + loc = NULL; + XL_RETAIN(loc); orig = 
pthread_getspecific(__locale_key); - pthread_setspecific(__locale_key, loc == LC_GLOBAL_LOCALE ? NULL : loc); + pthread_setspecific(__locale_key, loc); + XL_RELEASE(orig); } return (orig ? orig : LC_GLOBAL_LOCALE); } diff --git a/pthreads/pthread.c b/pthreads/pthread.c index 0936752..5e9aefb 100644 --- a/pthreads/pthread.c +++ b/pthreads/pthread.c @@ -222,7 +222,7 @@ _________________________________________ __private_extern__ void _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * funarg, size_t stacksize, unsigned int flags); -__private_extern__ +__private_extern__ void _pthread_wqthread(pthread_t self, mach_port_t kport, void * stackaddr, pthread_workitem_t item, int reuse); #define PTHREAD_START_CUSTOM 0x01000000 @@ -836,9 +836,9 @@ _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * f if ((pflags & PTHREAD_START_CUSTOM) == 0) { stackaddr = (char *)self; _pthread_struct_init(self, attrs, stackaddr, stacksize, 1, 1); - #if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) _pthread_set_self(self); - #endif +#endif LOCK(_pthread_list_lock); if (pflags & PTHREAD_START_SETSCHED) { self->policy = ((pflags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK); @@ -850,9 +850,9 @@ _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * f self->detached |= PTHREAD_CREATE_DETACHED; } } else { - #if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) _pthread_set_self(self); - #endif +#endif LOCK(_pthread_list_lock); } self->kernel_thread = kport; @@ -2090,9 +2090,6 @@ pthread_init(void) __oldstyle = 1; } #endif -#if defined(__arm__) - __oldstyle = 1; -#endif #if defined(_OBJC_PAGE_BASE_ADDRESS) { @@ -2110,7 +2107,7 @@ pthread_init(void) mig_init(1); /* enable multi-threaded mig interfaces */ if (__oldstyle == 0) { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) __bsdthread_register(thread_start, start_wqthread, round_page(sizeof(struct _pthread)), _pthread_start, &workq_targetconc[0], (__uint64_t)(&thread->tsd[__PTK_LIBDISPATCH_KEY0]) - (__uint64_t)thread); #else __bsdthread_register(_pthread_start, _pthread_wqthread, round_page(sizeof(struct _pthread)), NULL, &workq_targetconc[0], (__uint64_t)&thread->tsd[__PTK_LIBDISPATCH_KEY0] - (__uint64_t)thread); @@ -2493,7 +2490,7 @@ pthread_workqueue_atfork_parent(void) void pthread_workqueue_atfork_child(void) { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) /* * NOTE: workq additions here * are for i386,x86_64 only as @@ -2517,7 +2514,7 @@ _pthread_work_internal_init(void) pthread_workqueue_t wq; if (kernel_workq_setup == 0) { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) __bsdthread_register(thread_start, start_wqthread, round_page(sizeof(struct _pthread)),NULL,NULL, NULL); #else __bsdthread_register(_pthread_start, _pthread_wqthread, round_page(sizeof(struct _pthread)),NULL,NULL, NULL); @@ -2913,7 +2910,7 @@ _pthread_wqthread(pthread_t self, mach_port_t kport, void * stackaddr, pthread_w /* These are not joinable threads */ self->detached &= ~PTHREAD_CREATE_JOINABLE; self->detached |= PTHREAD_CREATE_DETACHED; -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) _pthread_set_self(self); #endif #if WQ_TRACE 
@@ -3094,10 +3091,6 @@ pthread_workqueue_create_np(pthread_workqueue_t * workqp, const pthread_workqueu pthread_workqueue_t wq; pthread_workqueue_head_t headp; -#if defined(__arm__) - /* not supported under arm */ - return(ENOTSUP); -#endif #if defined(__ppc__) IF_ROSETTA() { return(ENOTSUP); diff --git a/pthreads/pthread_machdep.h b/pthreads/pthread_machdep.h index 6ec9899..819df97 100644 --- a/pthreads/pthread_machdep.h +++ b/pthreads/pthread_machdep.h @@ -227,10 +227,12 @@ _pthread_getspecific_direct(unsigned long slot) #elif defined(__ppc64__) register void **__pthread_tsd asm ("r13"); ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))]; +#elif defined(__arm__) && defined(_ARM_ARCH_6) && !defined(_ARM_ARCH_7) && defined(__thumb__) && !defined(__OPTIMIZE__) + ret = pthread_getspecific(slot); #elif defined(__arm__) && defined(_ARM_ARCH_6) - void **__pthread_tsd; - __asm__ ("mrc p15, 0, %0, c13, c0, 3" : "=r"(__pthread_tsd)); - ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))]; + void **__pthread_tsd; + __asm__ ("mrc p15, 0, %0, c13, c0, 3" : "=r"(__pthread_tsd)); + ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))]; #elif defined(__arm__) && !defined(_ARM_ARCH_6) register void **__pthread_tsd asm ("r9"); ret = __pthread_tsd[slot + (_PTHREAD_TSD_OFFSET / sizeof(void *))];
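A closing note on the pthread_machdep.h hunk above: the ARMv6 fast path reads the per-thread TSD base from the user read-only thread ID register (TPIDRURO, CP15 c13/c0/3), and mrc has no Thumb-1 encoding, which is presumably why the unoptimized __thumb__ build now falls back to plain pthread_getspecific(). A C sketch under those assumptions (tsd_direct_sketch is a hypothetical name; the _PTHREAD_TSD_OFFSET adjustment is omitted):

#include <pthread.h>

/* Sketch only, not Apple's implementation. */
static void *tsd_direct_sketch(unsigned long slot)
{
#if defined(__arm__) && defined(_ARM_ARCH_6) && !defined(__thumb__)
    void **tsd;
    __asm__ ("mrc p15, 0, %0, c13, c0, 3" : "=r"(tsd)); /* read TPIDRURO */
    return tsd[slot];  /* real code also adds _PTHREAD_TSD_OFFSET / sizeof(void *) */
#else
    return pthread_getspecific((pthread_key_t)slot);    /* portable fallback */
#endif
}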