/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*****************************************************************************
 *  Cortex-A8 implementation                                                 *
 *****************************************************************************/

// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression.  Thus, we detect that particular case, and
// pass those copies through the ARM core registers.  All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009

.text
.code 16
.syntax unified

// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// All copy n successive bytes from source to destination.  memmove and memcpy
// return destination, whereas bcopy has no return value.  Copying takes place
// as if it were through a temporary buffer -- after return, destination
// contains exactly the bytes from source, even if the buffers overlap.

.thumb_func _bcopy
.globl _bcopy
.thumb_func _memmove
.globl _memmove
.thumb_func _memcpy
.globl _memcpy

.align 2
_bcopy:
    mov       r3,      r0               // swap the first and second arguments
    mov       r0,      r1               // and fall through into memmove
    mov       r1,      r3               //

.align 2
_memmove:
_memcpy:
    subs      r3,      r0,  r1          // offset = destination addr - source addr
    it        eq
    bxeq      lr                        // if source == destination, early out

// Our preference is for using a (faster) front-to-back copy.  However, if
// 0 < offset < length, it is necessary to copy back-to-front for correctness.
// We have already ruled out offset == 0, so we can use an unsigned compare
// with length -- if offset is higher, offset is either greater than length
// or negative.

    cmp       r3,      r2
    bhs       L_copyFrontToBack
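
// In rough C, the entry logic above is (illustrative sketch only, never
// assembled; the two helper names simply stand for the code paths below):
//
//     ptrdiff_t offset = (char *)destination - (const char *)source;
//     if (offset == 0) return destination;
//     if ((size_t)offset >= n) copy_front_to_back();   // no harmful overlap
//     else                     copy_back_to_front();   // dst overlaps end of src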

/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

    mov       ip,      r0               // copy destination pointer.
    add       r1,      r2               // move source pointer to end of source array
    add       ip,      r2               // move destination pointer to end of dest array

    subs      r2,      $8               // if length - 8 is negative (i.e. length
    blt       L_scalarReverseCopy       // is less than 8), jump to cleanup path.
    tst       ip,      $7               // if (destination + length) is doubleword
    beq       L_vectorReverseCopy       // aligned, jump to fast path.

0:  ldrb      r3,     [r1, $-1]!        // load byte
    sub       r2,      $1               // decrement length
    strb      r3,     [ip, $-1]!        // store byte
    tst       ip,      $7               // test alignment
    bne       0b

    cmp       r2,      $0               // if length - 8 is negative,
    blt       L_scalarReverseCopy       // jump to the cleanup code

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorReverseCopy:
    ands      r3,      r1,  $3          // Extract the alignment of the source
    bic       r1,      $3
    tbh      [pc, r3, lsl $1]           // Dispatch table on source alignment

0:
.short (L_reverseAligned0-0b)/2         // The NEON alignment hardware does not work
.short (L_reverseAligned1-0b)/2         // properly with sub 4-byte alignment and
.short (L_reverseAligned2-0b)/2         // buffers that are uncacheable, so we need
.short (L_reverseAligned3-0b)/2         // to have a software workaround.

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_reverseAligned0:
    subs      r2,      $0x38            // if length - 64 is negative, jump to
    blt       L_reverseVectorCleanup    // the cleanup path.
    tst       ip,      $0x38            // if (destination + length) is cacheline
    beq       L_reverseCachelineAligned // aligned, jump to the fast path.

0:  sub       r1,      $8               // copy eight bytes at a time until the
    vld1.32  {d0},    [r1]              // destination is cacheline (64-byte) aligned.
    sub       ip,      $8               //
    sub       r2,      $8               //
    tst       ip,      $0x38            //
    vst1.64  {d0},    [ip, :64]         //
    bne       0b                        //

    cmp       r2,      $0               // if length - 64 is negative,
    blt       L_reverseVectorCleanup    // jump to the cleanup code

L_reverseCachelineAligned:
    sub       r3,      r2,  $0x3c0      // If 1024 < length < 32768, use core
    cmp       r3,      $0x7c00          // register copies instead of NEON to
    blo       L_useSTMDB                // control energy usage.

    sub       r1,      $32              // decrement source
    sub       ip,      $32              // decrement destination
    mov       r3,      $-32             // load address increment
    tst       r1,      $0x1f            // if source shares 32 byte alignment
    beq       L_reverseSourceAligned    // jump to loop with more alignment hints

    vld1.32  {q2,q3}, [r1], r3          // This loop handles 4-byte aligned copies
    vld1.32  {q0,q1}, [r1], r3          // as generally as possible.
    subs      r2,      $64              //
    vst1.64  {q2,q3}, [ip,:256], r3     // The Cortex-A8 NEON unit does not always
    blt       1f                        // properly handle misalignment in vld1
.align 3                                // with an element size of 8 or 16, so
0:  vld1.32  {q2,q3}, [r1], r3          // this is the best we can do without
    vst1.64  {q0,q1}, [ip,:256], r3     // handling alignment in software.
    vld1.32  {q0,q1}, [r1], r3          //
    subs      r2,      $64              //
    vst1.64  {q2,q3}, [ip,:256], r3     //
    bge       0b                        //
    b         1f                        //

L_reverseSourceAligned:
    vld1.64  {q2,q3}, [r1,:256], r3     // Identical to loop above except for
    vld1.64  {q0,q1}, [r1,:256], r3     // additional alignment information; this
    subs      r2,      $64              // gets an additional .5 bytes per cycle
    vst1.64  {q2,q3}, [ip,:256], r3     // on Cortex-A8.
    blt       1f                        //
.align 3                                //
0:  vld1.64  {q2,q3}, [r1,:256], r3     //
    vst1.64  {q0,q1}, [ip,:256], r3     //
    vld1.64  {q0,q1}, [r1,:256], r3     //
    subs      r2,      $64              //
    vst1.64  {q2,q3}, [ip,:256], r3     //
    bge       0b                        //

1:  vst1.64  {q0,q1}, [ip,:256], r3     // loop cleanup: final 32 byte store
    add       r1,      $32              // point source at last element stored
    add       ip,      $32              // point destination at last element stored
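
// Note on length bookkeeping: throughout these paths r2 holds a biased count
// (length - 8 in the doubleword loops, length - 64 in the cacheline loops),
// so each "subs ... / bge" pair is, in effect (illustrative only):
//
//     while ((remaining -= step) >= 0) { copy step bytes; }
//
// The adds at the cleanup labels below restore the smaller bias, and finally
// the true byte count, before the next stage runs.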

L_reverseVectorCleanup:
    adds      r2,      $0x38            // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarReverseCopy       //

0:  sub       r1,      $8               // copy eight bytes at a time until
    vld1.32  {d0},    [r1]              // (length - 8) < 0.
    sub       ip,      $8               //
    subs      r2,      $8               //
    vst1.64  {d0},    [ip, :64]         //
    bge       0b                        //

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarReverseCopy:
    adds      r2,      #0x8             // restore length
    it        eq                        // if this is zero
    bxeq      lr                        // early out

0:  ldrb      r3,     [r1, #-1]!        // load a byte from source
    strb      r3,     [ip, #-1]!        // store to destination
    subs      r2,      #0x1             // subtract one from length
    bne       0b                        // if non-zero, repeat
    bx        lr                        // return

/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/

L_useSTMDB:
    push     {r4-r8,r10,r11}
.align 3
0:  ldmdb     r1!,    {r3-r8,r10,r11}
    subs      r2,      #0x40
    stmdb     ip!,    {r3-r8,r10,r11}
    ldmdb     r1!,    {r3-r8,r10,r11}
    pld      [r1, #-0x40]
    stmdb     ip!,    {r3-r8,r10,r11}
    bge       0b
    pop      {r4-r8,r10,r11}
    b         L_reverseVectorCleanup

/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.

#define RCOPY_UNALIGNED(offset)          \
    subs    r2,      $8                 ;\
    blt     2f                          ;\
    sub     r1,      $8                 ;\
    sub     ip,      $8                 ;\
    mov     r3,      $-8                ;\
    vld1.32 {d2,d3}, [r1], r3           ;\
    subs    r2,      $8                 ;\
    blt     1f                          ;\
0:  vext.8  d0,      d2, d3, $(offset)  ;\
    vmov    d3,      d2                 ;\
    vld1.32 {d2},   [r1], r3            ;\
    subs    r2,      $8                 ;\
    vst1.64 {d0},   [ip, :64], r3       ;\
    bge     0b                          ;\
1:  vext.8  d0,      d2, d3, $(offset)  ;\
    add     r1,      $8                 ;\
    vst1.64 {d0},   [ip, :64]           ;\
2:  add     r2,      $8                 ;\
    add     r1,      $(offset)          ;\
    b       L_scalarReverseCopy

L_reverseAligned1:
    RCOPY_UNALIGNED(1)
L_reverseAligned2:
    RCOPY_UNALIGNED(2)
L_reverseAligned3:
    RCOPY_UNALIGNED(3)

/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:
    mov       ip,      r0               // copy destination pointer.
    subs      r2,      $8               // if length - 8 is negative (i.e. length
    blt       L_scalarCopy              // is less than 8), jump to cleanup path.
    tst       ip,      $7               // if the destination is doubleword
    beq       L_vectorCopy              // aligned, jump to fast path.

0:  ldrb      r3,     [r1], $1          // load byte
    sub       r2,      $1               // decrement length
    strb      r3,     [ip], $1          // store byte
    tst       ip,      $7               // test alignment
    bne       0b

    cmp       r2,      $0               // if length - 8 is negative,
    blt       L_scalarCopy              // jump to the cleanup code

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorCopy:
    ands      r3,      r1,  $3          // Extract the alignment of the source
    bic       r1,      $3
    tbh      [pc, r3, lsl $1]           // Dispatch table on source alignment

0:
.short (L_sourceAligned0-0b)/2          // The NEON alignment hardware does not work
.short (L_sourceAligned1-0b)/2          // properly with sub 4-byte alignment and
.short (L_sourceAligned2-0b)/2          // buffers that are uncacheable, so we need
.short (L_sourceAligned3-0b)/2          // to have a software workaround.
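
// The tbh dispatch above amounts to (illustrative sketch only):
//
//     switch (source & 3) {
//         case 0: goto L_sourceAligned0;   // source at least word aligned
//         case 1: goto L_sourceAligned1;   // software fixup, vext.8 offset 1
//         case 2: goto L_sourceAligned2;   // software fixup, vext.8 offset 2
//         case 3: goto L_sourceAligned3;   // software fixup, vext.8 offset 3
//     }
//
// (and likewise for the L_reverseAligned table earlier).  Each .short entry is
// (target - table)/2 because tbh branches forward by twice the selected
// halfword, and pc points at the table itself here.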

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_sourceAligned0:
    subs      r2,      $0x38            // If (length - 64) < 0
    blt       L_vectorCleanup           // jump to cleanup code
    tst       ip,      $0x38            // If destination is 64 byte aligned
    beq       L_cachelineAligned        // jump to main loop

0:  vld1.32  {d0},    [r1]!             // Copy one double word at a time until
    sub       r2,      $8               // the destination is 64-byte aligned.
    vst1.64  {d0},    [ip, :64]!        //
    tst       ip,      $0x38            //
    bne       0b                        //

    cmp       r2,      $0               // If (length - 64) < 0, goto cleanup
    blt       L_vectorCleanup           //

L_cachelineAligned:
    sub       r3,      r2,  $0x3c0      // If 1024 < length < 32768, use core
    cmp       r3,      $0x7c00          // register copies instead of NEON to
    blo       L_useSTMIA                // control energy usage.

    tst       r1,      $0x1f            // If source has 32-byte alignment, use
    beq       L_sourceAligned32         // an optimized loop.

    vld1.32  {q2,q3}, [r1]!             // This is the most common path for small
    vld1.32  {q0,q1}, [r1]!             // copies, which are alarmingly frequent.
    subs      r2,      #0x40            // It requires 4-byte alignment on the
    vst1.64  {q2,q3}, [ip, :256]!       // source.  For ordinary malloc'd buffers,
    blt       1f                        // this path could handle sources with only
.align 3                                // single-byte alignment at speed by using
0:  vld1.32  {q2,q3}, [r1]!             // vld1.8 instead of vld1.32; however, the
    vst1.64  {q0,q1}, [ip, :256]!       // NEON alignment handler misbehaves for
    vld1.32  {q0,q1}, [r1]!             // some special copies if the element size
    subs      r2,      #0x40            // is 8 or 16, so we need to work around
    vst1.64  {q2,q3}, [ip, :256]!       // sub 4-byte alignment in software, in
    bge       0b                        // another code path.
    b         1f

L_sourceAligned32:
    vld1.64  {q2,q3}, [r1, :256]!       // When the source shares 32-byte alignment
    vld1.64  {q0,q1}, [r1, :256]!       // with the destination, we use this loop
    subs      r2,      #0x40            // instead, which specifies the maximum
    vst1.64  {q2,q3}, [ip, :256]!       // :256 alignment on all loads and stores.
    blt       1f                        //
.align 3                                // This gets an additional .5 bytes per
0:  vld1.64  {q2,q3}, [r1, :256]!       // cycle for in-cache copies, which is not
    vst1.64  {q0,q1}, [ip, :256]!       // insignificant for this (rather common)
    vld1.64  {q0,q1}, [r1, :256]!       // case.
    subs      r2,      #0x40            //
    vst1.64  {q2,q3}, [ip, :256]!       // This is identical to the above loop,
    bge       0b                        // except for the additional alignment.

1:  vst1.64  {q0,q1}, [ip, :256]!       //

L_vectorCleanup:
    adds      r2,      $0x38            // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarCopy              //

0:  vld1.32  {d0},    [r1]!             // Copy one doubleword at a time until
    subs      r2,      $8               // (length - 8) < 0.
    vst1.64  {d0},    [ip, :64]!        //
    bge       0b                        //
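
// The scalar cleanup below copies whatever tail remains -- never more than
// fifteen bytes on any path that reaches it -- one byte at a time.  In effect
// (illustrative only):
//
//     while (remaining--) *destination++ = *source++;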

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarCopy:
    adds      r2,      #0x8             // restore length
    it        eq                        // if this is zero
    bxeq      lr                        // early out

0:  ldrb      r3,     [r1], #1          // load a byte from source
    strb      r3,     [ip], #1          // store to destination
    subs      r2,      #1               // subtract one from length
    bne       0b                        // if non-zero, repeat
    bx        lr                        // return

/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/

L_useSTMIA:
    push     {r4-r8,r10,r11}
.align 3
0:  ldmia     r1!,    {r3-r8,r10,r11}
    subs      r2,      r2,  #64
    stmia     ip!,    {r3-r8,r10,r11}
    ldmia     r1!,    {r3-r8,r10,r11}
    pld      [r1, #64]
    stmia     ip!,    {r3-r8,r10,r11}
    bge       0b
    pop      {r4-r8,r10,r11}
    b         L_vectorCleanup

/*****************************************************************************
 *  Misaligned forward vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.

#define COPY_UNALIGNED(offset)           \
    subs    r2,      $8                 ;\
    blt     2f                          ;\
    vld1.32 {d2,d3}, [r1]!              ;\
    subs    r2,      $8                 ;\
    blt     1f                          ;\
0:  vext.8  d0,      d2, d3, $(offset)  ;\
    vmov    d2,      d3                 ;\
    vld1.32 {d3},   [r1]!               ;\
    subs    r2,      $8                 ;\
    vst1.64 {d0},   [ip, :64]!          ;\
    bge     0b                          ;\
1:  vext.8  d0,      d2, d3, $(offset)  ;\
    sub     r1,      $8                 ;\
    vst1.64 {d0},   [ip, :64]!          ;\
2:  add     r1,      $(offset)          ;\
    add     r2,      $8                 ;\
    b       L_scalarCopy

L_sourceAligned1:
    COPY_UNALIGNED(1)
L_sourceAligned2:
    COPY_UNALIGNED(2)
L_sourceAligned3:
    COPY_UNALIGNED(3)
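
// Note on the RCOPY_UNALIGNED/COPY_UNALIGNED macros above: vext.8 d0, d2, d3,
// $(offset) treats d2:d3 as a 16-byte vector (d2 supplying the lower-addressed
// doubleword in both macros) and extracts the eight consecutive bytes starting
// at byte index "offset".  For example, with offset = 1, d0 receives bytes 1-7
// of d2 followed by byte 0 of d3 -- i.e. the misaligned source doubleword that
// begins one byte into the aligned pair.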