+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+/**********************************************************************
+ *                      Cortex-A8 implementation                      *
+ **********************************************************************/
+
+// Cortex-A8 implementations of memset( ) and bzero( ). The main loop uses
+// 64-byte NEON stores, unless the buffer length is 1k or larger. At that
+// point there is little to no speed advantage with NEON (and a slight
+// regression in some measured cases), so we switch to the GPRs.
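+//
+// Rough C-style sketch of the control flow (structure only, not the exact
+// instruction sequence):
+//
+//     void *memset(void *dest, int value, size_t n) {
+//         store bytes until dest is 8-byte aligned;
+//         store 8-byte doublewords until dest is 64-byte aligned;
+//         if (remaining >= 1024) store 64 bytes/iteration with stmia;
+//         else                   store 64 bytes/iteration with vst1.64;
+//         store leftover doublewords, then leftover bytes;
+//         return dest;
+//     }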
+//
+// The crossover point should be reevaluated for future architectures.
+//
+// -- Stephen Canon, August 2009
+
+.text
+.syntax unified
+.code 16
+
+// void bzero(void * destination,
+// size_t length);
+//
+// zeros out a buffer length bytes long, beginning at the address destination.
+.thumb_func ___bzero$VARIANT$CortexA8
+.globl ___bzero$VARIANT$CortexA8
+.thumb_func _bzero$VARIANT$CortexA8
+.globl _bzero$VARIANT$CortexA8
+.align 2
+___bzero$VARIANT$CortexA8:
+_bzero$VARIANT$CortexA8:
+ mov r2, r1 // match the API to memset(dest, 0, length)
+ eor r1, r1 // and fall through into memset
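+ // (r0 already held the destination for both entry points; r2 now has the
+ // length and r1 is zero, so the splat in memset produces all-zero stores)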
+
+// void *memset(void * destination,
+// int value, size_t n);
+//
+// writes value converted to an unsigned char to n successive bytes, beginning
+// at destination.
+
+// Notes on register usage:
+//
+// Throughout this function, registers have nearly constant usage; the pattern
+// is:
+//
+// r0 holds the original destination pointer, unmodified. This value
+// must be returned by the routine, so it is easiest to just leave it
+// in place.
+// r1 holds the value that is being copied into the buffer, in some stage
+// of splattedness. The low byte is guaranteed always to hold the value,
+// but the higher bytes may or may not contain copies of it.
+// r2 holds the length minus some offset, where the offset is always the
+// number of bytes that the current loop stores per iteration.
+// r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted
+// copies of the value to be stored.
+// ip holds a pointer to the lowest byte in the array that has not yet been
+// set to hold value.
+// q0 and q1 hold splatted copies of the value in the vector path, and are
+// otherwise unused.
+
+.thumb_func _memset$VARIANT$CortexA8
+.globl _memset$VARIANT$CortexA8
+.align 2
+_memset$VARIANT$CortexA8:
+ mov ip, r0 // copy destination pointer.
+ subs r2, #0x8 // if length - 8 is negative (i.e. length
+ and r1, #0xff // is less than 8), jump to cleanup path.
+ blt L_scalarCleanup //
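+
+// r2 now holds length - 8; each loop below keeps r2 biased by its own store
+// size, so the flags from subs alone decide whether another full store fits.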
+
+ tst ip, #0x7 // if the destination is doubleword
+ beq L_vectorCopy // aligned, jump to fast path.
+
+0: strb r1, [ip], #1 // store one byte at a time until
+ sub r2, #1 // destination pointer is 8 byte aligned.
+ tst ip, #7 //
+ bne 0b //
+
+ cmp r2, #0x0 // if length - 8 is negative,
+ blt L_scalarCleanup // jump to the cleanup code
+
+L_vectorCopy:
+ vdup.8 q0, r1 // splat the byte to be stored across
+ subs r2, #0x38 // q0 and q1, and check if length - 64
+ vmov q1, q0 // is negative; if so, jump to the
+ blt L_vectorCleanup // cleanup code.
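+
+// r2 is now length - 8 - 0x38 = length - 64, matching the 64 bytes stored by
+// each iteration of the main loop.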
+
+ tst ip, #0x38 // if the destination is cacheline
+ beq L_cachelineAligned // aligned, jump to the fast path.
+
+0: vst1.64 {d0}, [ip, :64]! // store one double word at a time until
+ sub r2, #8 // the destination is 64-byte aligned
+ tst ip, #0x38 //
+ bne 0b
+
+ cmp r2, #0x0 // if length - 64 is negative,
+ blt L_vectorCleanup // jump to the cleanup code
+
+L_cachelineAligned:
+ cmp r2, #0x3c0 // if length >= 1024 (r2 is length - 64
+ bge L_useSTMIA // and 0x3c0 == 960), use stmia instead
+
+.align 4 // main loop
+0: vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes
+ subs r2, #0x40 // decrement length by 64
+ vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes
+ bge 0b // if length - 64 >= 0, continue
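+
+// Each iteration above stores 64 bytes as two 32-byte {q0,q1} stores, with
+// the subs scheduled between them, presumably to overlap the flag-setting
+// ALU op with the store pipeline.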
+
+L_vectorCleanup:
+ adds r2, #0x38 // if (length - 8) < 0, goto scalar cleanup
+ blt L_scalarCleanup //
+
+0: subs r2, #8 // store one double word at a time until
+ vst1.64 {d0}, [ip, :64]! // (length - 8) < 0.
+ bge 0b
+
+L_scalarCleanup:
+ adds r2, #8 // undo the -8 bias to restore the true
+ beq 1f // remaining length; early out if zero.
+
+0: strb r1, [ip], #1 // store one byte at a time until length
+ subs r2, #1 // is zero.
+ bne 0b //
+1: bx lr // return.
+
+// STMIA loop for large buffers
+//
+// For stores of 1024 bytes or more, we use STMIA because we can't get enough
+// of a speedup from NEON to offset the higher power draw of the NEON unit.
+//
+// This crossover should be reevaluated on future architectures.
+//
+// We avoid using r7 and r9 even though it's not strictly necessary (r7 is
+// the frame pointer and r9 is reserved in Apple's ARM ABI).
+
+L_useSTMIA:
+ push {r4,r5,r6,r8,r10,r11}
+ orr r1, r1, r1, lsl #8
+ orr r1, r1, r1, lsl #16
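+ // e.g. value 0xab: 0x000000ab -> 0x0000abab -> 0xabababab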
+ mov r3, r1
+ mov r4, r1
+ mov r5, r1
+ mov r6, r1
+ mov r8, r1
+ mov r10, r1
+ mov r11, r1
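+ // 8 registers x 4 bytes = 32 bytes per stmia; two stmia per iteration = 64.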
+.align 4
+0: stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11}
+ subs r2, #0x40
+ stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11}
+ bge 0b
+ pop {r4,r5,r6,r8,r10,r11}
+ b L_vectorCleanup
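+
+// r2 leaves this loop with the same -64 bias as the NEON main loop, so the
+// branch to L_vectorCleanup needs no adjustment.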
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+