diff --git a/arm/string/NEON/bcopy.s b/arm/string/NEON/bcopy.s
new file mode 100644 (file)
index 0000000..30abab1
--- /dev/null
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*****************************************************************************
+ * Cortex-A8 implementation                                                  *
+ *****************************************************************************/
+// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
+//
+// Our tests have shown that NEON is always a performance win for memcpy( ).
+// However, for the specific case of copies from a warm source to a cold
+// destination when the buffer size is between 1k and 32k, it is not enough
+// of a performance win to offset the increased power footprint, resulting
+// in an energy usage regression.  Thus, we detect that particular case, and
+// pass those copies through the ARM core registers.  All other copies larger
+// than 8 bytes are handled on NEON.
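+//
+// As a rough sketch of that dispatch policy (hypothetical C, not part of
+// this file; the exact thresholds and alignment handling are in the code
+// below):
+//
+//     if (length < 8)
+//         copy_bytewise(dst, src, length);            // scalar cleanup paths
+//     else if (length >= 1024 && length < 32768)
+//         copy_with_core_registers(dst, src, length); // LDM/STM loops
+//     else
+//         copy_with_neon(dst, src, length);           // VLD1/VST1 loops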
+//
+// Stephen Canon, August 2009
+
+.text
+.code 16
+.syntax unified
+
+// void bcopy(const void * source,
+//            void * destination,
+//            size_t length);
+//
+// void *memmove(void * destination,
+//               const void * source,
+//               size_t n);
+//
+// void *memcpy(void * restrict destination,
+//              const void * restrict source,
+//              size_t n);
+//
+// All three copy n successive bytes from source to destination.  memmove and
+// memcpy return destination, whereas bcopy has no return value.  Copying takes
+// place as if through a temporary buffer -- after the call returns, destination
+// contains exactly the bytes from source, even if the buffers overlap.
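+//
+// For example (a hypothetical caller, not part of this file):
+//
+//     char buf[16] = "hello, world";
+//     memmove(buf + 1, buf, 13);   // overlapping copy is well defined
+//     bcopy(buf, buf + 1, 13);     // same effect; note the reversed
+//                                  // (source, destination) argument order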
+
+.thumb_func _bcopy
+.globl _bcopy    
+.thumb_func _memmove
+.globl _memmove
+.thumb_func _memcpy
+.globl _memcpy
+
+.align 2
+_bcopy:
+    mov       r3,      r0           // swap the first and second arguments
+    mov       r0,      r1           // and fall through into memmove
+    mov       r1,      r3           //
+
+.align 2
+_memmove:
+_memcpy:
+    subs      r3,      r0,  r1      // offset = destination addr - source addr
+    it        eq
+    bxeq      lr                    // if source == destination, early out
+
+//  Our preference is for using a (faster) front-to-back copy.  However, if
+//  0 < offset < length, it is necessary to copy back-to-front for correctness.
+//  We have already ruled out offset == 0, so we can use an unsigned compare
+//  with length -- if offset is higher, offset is either greater than length
+//  or negative.
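+//
+//  Equivalently, in C (a sketch of the test below, not part of this file):
+//
+//      ptrdiff_t offset = (char *)destination - (char *)source;
+//      if ((size_t)offset >= length)  // negative offsets wrap to huge values
+//          copy_front_to_back();      // no harmful overlap
+//      else                           // 0 < offset < length:
+//          copy_back_to_front();      // dest overlaps the unread tail of source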
+
+    cmp       r3,      r2
+    bhs       L_copyFrontToBack
+                             
+/*****************************************************************************
+ *  back to front copy                                                       *
+ *****************************************************************************/
+
+    mov       ip,      r0           // copy destination pointer.
+    add       r1,           r2      // move source pointer to end of source array
+    add       ip,           r2      // move destination pointer to end of dest array
+    
+    subs      r2,           $8      // if length - 8 is negative (i.e. length
+    blt       L_scalarReverseCopy   // is less than 8), jump to cleanup path.
+    tst       ip,           $7      // if (destination + length) is doubleword
+    beq       L_vectorReverseCopy   // aligned, jump to fast path.
+    
+0:  ldrb      r3,     [r1, $-1]!    // load byte
+    sub       r2,           $1      // decrement length
+    strb      r3,     [ip, $-1]!    // store byte
+    tst       ip,           $7      // test alignment
+    bne       0b
+    
+    cmp       r2,           $0      // if length - 8 is negative,
+    blt       L_scalarReverseCopy   // jump to the cleanup code
+                                    
+/*****************************************************************************
+ *  destination is doubleword aligned                                        *
+ *****************************************************************************/
+
+L_vectorReverseCopy:
+    ands      r3,      r1,  $3      // Extract the alignment of the source
+    bic       r1,           $3
+    tbh      [pc, r3, lsl $1]       // Dispatch table on source alignment
+0:  
+.short (L_reverseAligned0-0b)/2     // The NEON alignment hardware does not work
+.short (L_reverseAligned1-0b)/2     // properly with sub 4-byte alignment and
+.short (L_reverseAligned2-0b)/2     // buffers that are uncacheable, so we need
+.short (L_reverseAligned3-0b)/2     // to have a software workaround.
+
+/*****************************************************************************
+ *  source is also at least word aligned                                     *
+ *****************************************************************************/
+    
+L_reverseAligned0:
+    subs      r2,           $0x38   // if length - 64 is negative, jump to
+    blt       L_reverseVectorCleanup// the cleanup path.
+    tst       ip,           $0x38   // if (destination + length) is cacheline
+    beq       L_reverseCachelineAligned // aligned, jump to the fast path.
+    
+0:  sub       r1,           $8      // copy eight bytes at a time until the
+    vld1.32  {d0},    [r1]          // destination is 8 byte aligned.
+    sub       ip,           $8      //
+    sub       r2,           $8      //
+    tst       ip,           $0x38   //
+    vst1.64  {d0},    [ip, :64]     //
+    bne       0b                    //
+    
+    cmp       r2,           $0      // if length - 64 is negative,
+    blt       L_reverseVectorCleanup// jump to the cleanup code
+    
+L_reverseCachelineAligned:
+    sub       r3,      r2,  $0x3c0  // If 1024 < length < 32768, use core
+    cmp       r3,          $0x7c00  // register copies instead of NEON to
+    blo       L_useSTMDB            // control energy usage.
+    
+    sub       r1,           $32     // decrement source
+    sub       ip,           $32     // decrement destination
+    mov       r3,           $-32    // load address increment
+    tst       r1,           $0x1f   // if source shares 32 byte alignment
+    beq       L_reverseSourceAligned// jump to loop with more alignment hints
+    
+    vld1.32  {q2,q3}, [r1], r3      // This loop handles 4-byte aligned copies
+    vld1.32  {q0,q1}, [r1], r3      // as generally as possible.
+    subs      r2,           $64     // 
+    vst1.64  {q2,q3}, [ip,:256], r3 // The Cortex-A8 NEON unit does not always
+    blt       1f                    // properly handle misalignment in vld1
+.align 3                            // with an element size of 8 or 16, so
+0:  vld1.32  {q2,q3}, [r1], r3      // this is the best we can do without
+    vst1.64  {q0,q1}, [ip,:256], r3 // handling alignment in software.
+    vld1.32   {q0,q1}, [r1], r3     // 
+    subs      r2,           $64     // 
+    vst1.64  {q2,q3}, [ip,:256], r3 // 
+    bge       0b                    // 
+    b         1f                    // 
+    
+L_reverseSourceAligned:
+    vld1.64  {q2,q3}, [r1,:256], r3 // Identical to loop above except for
+    vld1.64  {q0,q1}, [r1,:256], r3 // additional alignment information; this
+    subs      r2,           $64     // gets an additional .5 bytes per cycle
+    vst1.64  {q2,q3}, [ip,:256], r3 // on Cortex-A8.
+    blt       1f                    // 
+.align 3                            // 
+0:  vld1.64  {q2,q3}, [r1,:256], r3 //
+    vst1.64  {q0,q1}, [ip,:256], r3 //
+    vld1.64  {q0,q1}, [r1,:256], r3 //
+    subs      r2,           $64     //
+    vst1.64  {q2,q3}, [ip,:256], r3 //
+    bge       0b                    //
+1:  vst1.64  {q0,q1}, [ip,:256], r3 // loop cleanup: final 32 byte store
+    add       r1,           $32     // point source at last element stored
+    add       ip,           $32     // point destination at last element stored
+    
+L_reverseVectorCleanup:
+    adds      r2,           $0x38   // If (length - 8) < 0, goto scalar cleanup
+    blt       L_scalarReverseCopy   //
+
+0:  sub       r1,           $8      // copy eight bytes at a time until
+    vld1.32  {d0},    [r1]          // (length - 8) < 0.
+    sub       ip,           $8      //
+    subs      r2,           $8      //
+    vst1.64  {d0},    [ip, :64]     //
+    bge       0b                    //
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarReverseCopy:
+    adds      r2,           #0x8    // restore length
+    it        eq                    // if this is zero
+    bxeq      lr                    // early out
+         
+0:  ldrb      r3,     [r1, #-1]!    // load a byte from source
+    strb      r3,     [ip, #-1]!    // store to destination
+    subs      r2,           #0x1    // subtract one from length
+    bne       0b                    // if non-zero, repeat
+    bx        lr                    // return
+         
+/*****************************************************************************
+ *  STMDB loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+L_useSTMDB:
+    push     {r4-r8,r10,r11}
+.align 3
+0:  ldmdb     r1!,  {r3-r8,r10,r11}
+    subs      r2,           #0x40
+    stmdb     ip!,  {r3-r8,r10,r11}
+    ldmdb     r1!,  {r3-r8,r10,r11}
+    pld      [r1, #-0x40]
+    stmdb     ip!,  {r3-r8,r10,r11}
+    bge       0b
+    pop      {r4-r8,r10,r11}
+    b         L_reverseVectorCleanup
+    
+/*****************************************************************************
+ *  Misaligned reverse vld1 loop                                             *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source, 
+// use vext.8 to extract a double word to store, and perform an 8-byte aligned
+// store to destination.
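+//
+// One vext.8 step behaves like this little-endian C sketch (illustrative
+// only, not part of this file), where lo and hi are the two aligned
+// doubleword halves already in registers and n is the byte misalignment:
+//
+//     uint64_t extract(uint64_t lo, uint64_t hi, unsigned n) {
+//         return (lo >> (8*n)) | (hi << (64 - 8*n));  // here n is 1, 2 or 3
+//     }
+//
+// so memory is only ever touched with word-aligned loads and
+// doubleword-aligned stores.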
+
+#define RCOPY_UNALIGNED(offset)      \
+    subs      r2,          $8       ;\
+    blt       2f                    ;\
+    sub       r1,          $8       ;\
+    sub       ip,          $8       ;\
+    mov       r3,          $-8      ;\
+    vld1.32  {d2,d3}, [r1], r3      ;\
+    subs      r2,          $8       ;\
+    blt       1f                    ;\
+0:  vext.8    d0,  d2, d3, $(offset);\
+    vmov      d3,      d2           ;\
+    vld1.32  {d2},    [r1], r3      ;\
+    subs      r2,          $8       ;\
+    vst1.64  {d0},    [ip, :64], r3 ;\
+    bge       0b                    ;\
+1:  vext.8    d0,  d2, d3, $(offset);\
+    add       r1,          $8       ;\
+    vst1.64  {d0},    [ip, :64]     ;\
+2:  add       r2,          $8       ;\
+    add       r1,          $(offset);\
+    b         L_scalarReverseCopy
+
+L_reverseAligned1:
+    RCOPY_UNALIGNED(1)
+L_reverseAligned2:
+    RCOPY_UNALIGNED(2)
+L_reverseAligned3:
+    RCOPY_UNALIGNED(3)
+
+/*****************************************************************************
+ *  front to back copy                                                       *
+ *****************************************************************************/
+
+L_copyFrontToBack:
+    mov       ip,      r0           // copy destination pointer.
+    subs      r2,           $8      // if length - 8 is negative (i.e. length
+    blt       L_scalarCopy          // is less than 8), jump to cleanup path.
+    tst       ip,           $7      // if the destination is doubleword
+    beq       L_vectorCopy          // aligned, jump to fast path.
+    
+0:  ldrb      r3,     [r1], $1      // load byte
+    sub       r2,           $1      // decrement length
+    strb      r3,     [ip], $1      // store byte
+    tst       ip,           $7      // test alignment
+    bne       0b
+    
+    cmp       r2,           $0      // if length - 8 is negative,
+    blt       L_scalarCopy          // jump to the cleanup code
+    
+/*****************************************************************************
+ *  destination is doubleword aligned                                        *
+ *****************************************************************************/
+
+L_vectorCopy:
+    ands      r3,      r1,  $3      // Extract the alignment of the source
+    bic       r1,           $3
+    tbh      [pc, r3, lsl $1]       // Dispatch table on source alignment
+0:  
+.short (L_sourceAligned0-0b)/2      // The NEON alignment hardware does not work
+.short (L_sourceAligned1-0b)/2      // properly with sub 4-byte alignment and
+.short (L_sourceAligned2-0b)/2      // buffers that are uncacheable, so we need
+.short (L_sourceAligned3-0b)/2      // to have a software workaround.
+
+/*****************************************************************************
+ *  source is also at least word aligned                                     *
+ *****************************************************************************/
+    
+L_sourceAligned0:
+    subs      r2,           $0x38   // If (length - 64) < 0
+    blt       L_vectorCleanup       //   jump to cleanup code
+    tst       ip,           $0x38   // If destination is 64 byte aligned
+    beq       L_cachelineAligned    //   jump to main loop
+    
+0:  vld1.32  {d0},    [r1]!         // Copy one double word at a time until
+    sub       r2,           $8      // the destination is 64-byte aligned.
+    vst1.64  {d0},    [ip, :64]!    //
+    tst       ip,           $0x38   //
+    bne       0b                    //
+    
+    cmp       r2,           $0      // If (length - 64) < 0, goto cleanup
+    blt       L_vectorCleanup       //
+    
+L_cachelineAligned:
+    sub       r3,      r2,  $0x3c0  // If 1024 < length < 32768, use core
+    cmp       r3,          $0x7c00  // register copies instead of NEON to
+    blo       L_useSTMIA            // control energy usage.
+    tst       r1,           $0x1f   // If source has 32-byte alignment, use
+    beq       L_sourceAligned32     // an optimized loop.
+    
+    vld1.32  {q2,q3}, [r1]!         // This is the most common path for small
+    vld1.32  {q0,q1}, [r1]!         // copies, which are alarmingly frequent.
+    subs      r2,           #0x40   // It requires 4-byte alignment on the
+    vst1.64  {q2,q3}, [ip, :256]!   // source.  For ordinary malloc'd buffers,
+    blt       1f                    // this path could handle only single-byte
+.align 3                            // alignment at speed by using vld1.8
+0:  vld1.32  {q2,q3}, [r1]!         // instead of vld1.32; however, the NEON
+    vst1.64  {q0,q1}, [ip, :256]!   // alignment handler misbehaves for some
+    vld1.32  {q0,q1}, [r1]!         // special copies if the element size is
+    subs      r2,           #0x40   // 8 or 16, so we need to work around
+    vst1.64  {q2,q3}, [ip, :256]!   // sub 4-byte alignment in software, in
+    bge       0b                    // another code path.
+    b         1f
+    
+L_sourceAligned32:
+    vld1.64  {q2,q3}, [r1, :256]!   // When the source shares 32-byte alignment
+    vld1.64  {q0,q1}, [r1, :256]!   // with the destination, we use this loop
+    subs      r2,           #0x40   // instead, which specifies the maximum
+    vst1.64  {q2,q3}, [ip, :256]!   // :256 alignment on all loads and stores.
+    blt       1f                    // 
+.align 3                            // This gets an additional .5 bytes per
+0:  vld1.64  {q2,q3}, [r1, :256]!   // cycle for in-cache copies, which is not
+    vst1.64  {q0,q1}, [ip, :256]!   // insignificant for this (rather common)
+    vld1.64  {q0,q1}, [r1, :256]!   // case.
+    subs      r2,           #0x40   // 
+    vst1.64  {q2,q3}, [ip, :256]!   // This is identical to the above loop,
+    bge       0b                    // except for the additional alignment.
+1:  vst1.64  {q0,q1}, [ip, :256]!   // 
+
+L_vectorCleanup:
+    adds      r2,           $0x38   // If (length - 8) < 0, goto scalar cleanup
+    blt       L_scalarCopy          //
+    
+0:  vld1.32  {d0},    [r1]!         // Copy one doubleword at a time until
+    subs      r2,           $8      // (length - 8) < 0.
+    vst1.64  {d0},    [ip, :64]!    //
+    bge       0b                    //
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarCopy:
+    adds      r2,           #0x8    // restore length
+    it        eq                    // if this is zero
+    bxeq      lr                    // early out
+         
+0:  ldrb      r3,     [r1], #1      // load a byte from source
+    strb      r3,     [ip], #1      // store to destination
+    subs      r2,           #1      // subtract one from length
+    bne       0b                    // if non-zero, repeat
+    bx        lr                    // return
+    
+/*****************************************************************************
+ *  STMIA loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+L_useSTMIA:
+    push     {r4-r8,r10,r11}
+.align 3
+0:  ldmia     r1!,  {r3-r8,r10,r11}
+    subs      r2,      r2,  #64
+    stmia     ip!,  {r3-r8,r10,r11}
+    ldmia     r1!,  {r3-r8,r10,r11}
+    pld      [r1, #64]
+    stmia     ip!,  {r3-r8,r10,r11}
+    bge       0b
+    pop      {r4-r8,r10,r11}
+    b         L_vectorCleanup
+    
+/*****************************************************************************
+ *  Misaligned vld1 loop                                                     *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source, 
+// use vext.8 to extract a double word to store, and perform an 8-byte aligned
+// store to destination.
+
+#define COPY_UNALIGNED(offset)       \
+    subs      r2,          $8       ;\
+    blt       2f                    ;\
+    vld1.32  {d2,d3}, [r1]!         ;\
+    subs      r2,          $8       ;\
+    blt       1f                    ;\
+0:  vext.8    d0,  d2, d3, $(offset);\
+    vmov      d2,      d3           ;\
+    vld1.32  {d3},    [r1]!         ;\
+    subs      r2,          $8       ;\
+    vst1.64  {d0},    [ip, :64]!    ;\
+    bge       0b                    ;\
+1:  vext.8    d0,  d2, d3, $(offset);\
+    sub       r1,          $8       ;\
+    vst1.64  {d0},    [ip, :64]!    ;\
+2:  add       r1,          $(offset);\
+    add       r2,          $8       ;\
+    b         L_scalarCopy
+
+L_sourceAligned1:
+    COPY_UNALIGNED(1)
+L_sourceAligned2:
+    COPY_UNALIGNED(2)
+L_sourceAligned3:
+    COPY_UNALIGNED(3)