+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+/*****************************************************************************
+ * Macros                                                                    *
+ *****************************************************************************/
+
+#define A9_ENTRY(name) \
+ .align 2;\
+ .globl _ ## name ## $VARIANT$CortexA9;\
+ _ ## name ## $VARIANT$CortexA9:
+
+#define ESTABLISH_FRAME \
+ push {r0,r4,r7,lr};\
+ add r7, sp, #8
+
+#define CLEAR_FRAME_AND_RETURN \
+ pop {r0,r4,r7,pc}
+
+#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}
+
+#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}
+
+/*****************************************************************************
+ * entry points                                                              *
+ *****************************************************************************/
+
+.text
+.syntax unified
+.code 32
+
+A9_ENTRY(bcopy)
+// Translate bcopy calls into memcpy calls by swapping the first and second
+// arguments.
+ mov r3, r0
+ mov r0, r1
+ mov r1, r3
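+// In C terms this is just the argument-order difference between the two
+// interfaces (an illustrative sketch, not part of the build):
+//
+//     void bcopy(const void *src, void *dst, size_t n) {
+//         memcpy(dst, src, n);
+//     }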
+
+A9_ENTRY(memcpy)
+A9_ENTRY(memmove)
+// Our preference is to copy the data in ascending address order, but if the
+// buffers overlap such that the beginning of the destination buffer aliases
+// the end of the source buffer, we need to copy in descending address order
+// instead to preserve the memmove semantics. We detect this case with the
+// test:
+//
+// destination - source < length (unsigned compare)
+//
+// If the address of the source buffer is higher than the address of the
+// destination buffer, this arithmetic can overflow, but the overflowed value
+// can only be smaller than length if the buffers do not overlap, so we don't
+// need to worry about false positives due to the overflow (they happen, but
+// only in cases where copying in either order is correct).
+ subs r3, r0, r1
+ bxeq lr
+ ESTABLISH_FRAME
+ cmp r3, r2
+ blo L_descendingCopy
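+// A rough C sketch of the dispatch above (illustrative only; copy_ascending
+// and copy_descending are hypothetical names for the two code paths below):
+//
+//     void *memmove_sketch(void *dst, const void *src, size_t n) {
+//         if (dst == src)
+//             return dst;                           // the bxeq lr above
+//         if ((uintptr_t)dst - (uintptr_t)src < n)  // unsigned compare
+//             copy_descending(dst, src, n);
+//         else
+//             copy_ascending(dst, src, n);
+//         return dst;
+//     }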
+
+/*****************************************************************************
+ * ascending copy                                                            *
+ *****************************************************************************/
+
+// The layout of the two buffers is such that we can use our preferred
+// (ascending address order) copy implementation. Throughout this copy,
+// registers are used as follows:
+//
+// r0 lowest unwritten address in the destination buffer.
+// r1 lowest unread address in the source buffer.
+// r2 number of bytes remaining to copy less an offset that varies
+// with the size of the copies that are being made.
+// r3, r4, r5, r6, r8, r9, r10, r12
+// temporary registers used to hold the data during copies.
+// r12 also used as a scratch register for alignment / length calculations
+
+L_ascendingCopy:
+// We begin by checking if less than four bytes are to be copied; if so, we
+// branch directly to a small-buffer copy and return. Otherwise, we copy up
+// to three bytes if needed to make the destination pointer have word (four
+// byte) alignment.
+ subs r2, #4
+ blo L_ascendingLengthLessThanFour
+ ands ip, r0, #0x3
+ beq L_ascendingDestinationWordAligned
+ ldrb r3, [r1],#1
+ cmp ip, #2
+ ldrbls r4, [r1],#1
+ strb r3, [r0],#1
+ ldrblo r3, [r1],#1
+ add r2, ip
+ strbls r4, [r0],#1
+ strblo r3, [r0],#1
+ subs r2, #4
+ bhs L_ascendingDestinationWordAligned
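+// The add deliberately does not set flags, so the strbls/strblo conditions
+// still come from the cmp above.  The ladder copies exactly 4 - (dst & 3)
+// bytes; an illustrative C sketch, with dst/src/len standing in for r0/r1
+// and the remaining length:
+//
+//     size_t head = 4 - ((uintptr_t)dst & 3);  // 1, 2, or 3 here
+//     len -= head;
+//     while (head--)
+//         *dst++ = *src++;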
+
+L_ascendingLengthLessThanFour:
+// Conditionally copies up to three bytes, making no alignment assumptions.
+// Reached when the original length is smaller than four, or when fewer than
+// four bytes remain after the alignment copy above.
+ lsls ip, r2, #31
+ ldrbcs r3, [r1],#1
+ ldrbcs ip, [r1],#1
+ ldrbmi r4, [r1]
+ strbcs r3, [r0],#1
+ strbcs ip, [r0],#1
+ strbmi r4, [r0]
+ CLEAR_FRAME_AND_RETURN
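+// The lsls shifts bit 1 of r2 into the carry flag and bit 0 into the sign
+// bit of the result (the N flag); the low two bits of r2 match those of the
+// remaining length, so the "cs" pair copies a two-byte chunk and the "mi"
+// pair copies the final byte.  Illustrative C sketch:
+//
+//     if (len & 2) { *dst++ = *src++; *dst++ = *src++; }
+//     if (len & 1) { *dst = *src; }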
+
+L_ascendingDestinationWordAligned:
+// We know that the destination has word alignment. If the source is not
+// similarly aligned, jump to an unaligned copy loop.
+ tst r1, #0x3
+ bne L_ascendingUnalignedCopy
+
+/*****************************************************************************
+ * ascending copy, both buffers have word alignment                          *
+ *****************************************************************************/
+
+// If less than sixty-four bytes remain to be copied, jump directly to the
+// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
+// to make the destination pointer have cacheline alignment.
+ subs r2, r2, #0x3c
+ blo L_ascendingLengthLessThanSixtyFour
+0: tst r0, #0x1c
+ beq L_ascendingDestinationCachelineAligned
+ ldr r3, [r1],#4
+ subs r2, #4
+ str r3, [r0],#4
+ bhs 0b
+ b L_ascendingLengthLessThanSixtyFour
+
+L_ascendingDestinationCachelineAligned:
+// Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
+// Empirical testing suggests that 0x60 is the optimal lookahead for preload,
+// though anything between 0x40 and 0x100 seems to be "acceptable".
+ push ADDITIONAL_CALLEE_SAVE_REGISTERS
+0: ldm r1!, COPY_REGISTERS
+ subs r2, r2, #0x40
+ stm r0!, COPY_REGISTERS
+ pld [r1, #0x60]
+ ldm r1!, COPY_REGISTERS
+ pld [r1, #0x60]
+ stm r0!, COPY_REGISTERS
+ bhs 0b
+ pop ADDITIONAL_CALLEE_SAVE_REGISTERS
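+// Neither stm nor pld sets flags, so the bhs still tests the subs at the
+// top of the loop.  An illustrative C sketch of the loop, with dst/src as
+// uint32_t pointers and __builtin_prefetch standing in for pld:
+//
+//     while (len >= 64) {
+//         __builtin_prefetch(src + 24);  // 24 words = 0x60 bytes ahead
+//         for (int i = 0; i < 16; ++i)   // 16 words = two 32-byte cachelines
+//             dst[i] = src[i];
+//         dst += 16; src += 16; len -= 64;
+//     }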
+
+L_ascendingLengthLessThanSixtyFour:
+// Cleanup copy of up to 63 bytes. We can assume that both the source and
+// destination addresses have word alignment here.
+ tst r2, #0x30
+ beq 1f
+0: ldm r1!, {r3,r4,r9,ip}
+ sub r2, r2, #0x10
+ stm r0!, {r3,r4,r9,ip}
+ tst r2, #0x30
+ bne 0b
+1: tst r2, #0xf
+ beq 2f
+ lsls ip, r2, #29
+ ldmcs r1!, {r3,ip}
+ stmcs r0!, {r3,ip}
+ ldrmi r3, [r1],#4
+ strmi r3, [r0],#4
+ lsls ip, r2, #31
+ ldrhcs r3, [r1],#2
+ strhcs r3, [r0],#2
+ ldrbmi ip, [r1]
+ strbmi ip, [r0]
+2: CLEAR_FRAME_AND_RETURN
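+// The 16-byte loop peels off bits 4 and 5 of the remaining count; the same
+// flag trick as the small-copy path then finishes up: lsls #29 moves bit 3
+// into carry and bit 2 into N (8- and 4-byte copies), and lsls #31 handles
+// bits 1 and 0 (halfword and byte).  Illustrative C sketch, with d/s as
+// byte pointers:
+//
+//     while (len & 0x30) { memcpy(d, s, 16); d += 16; s += 16; len -= 16; }
+//     if (len & 8) { memcpy(d, s, 8); d += 8; s += 8; }
+//     if (len & 4) { memcpy(d, s, 4); d += 4; s += 4; }
+//     if (len & 2) { memcpy(d, s, 2); d += 2; s += 2; }
+//     if (len & 1) { *d = *s; }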
+
+/*****************************************************************************
+ * ascending copy, source buffer is not word aligned                         *
+ *****************************************************************************/
+
+L_ascendingUnalignedCopy:
+// Destination buffer is word aligned, but source buffer is not. Copy
+// byte-by-byte until the destination buffer has eight-byte alignment.
+ subs r2, #4
+ blo L_ascendingUnalignedByteCleanup
+0: tst r0, #0x7
+ beq L_ascendingUnalignedVectorCopy
+ ldrb r3, [r1],#1
+ subs r2, #1
+ strb r3, [r0],#1
+ bhs 0b
+L_ascendingUnalignedByteCleanup:
+ adds r2, #8
+ beq 1f
+0: ldrb r3, [r1],#1
+ subs r2, #1
+ strb r3, [r0],#1
+ bne 0b
+1: CLEAR_FRAME_AND_RETURN
+
+L_ascendingUnalignedVectorCopy:
+// Destination buffer is eight-byte aligned. Source buffer has unknown
+// alignment. Use NEON to handle the misaligned copies. We begin by copying
+// up to 24 bytes to get cacheline alignment of the destination buffer.
+ subs r2, #0x18
+ blo L_ascendingUnalignedVectorCleanup
+0: tst r0, #0x18
+ beq L_ascendingUnalignedCachelineCopy
+ vld1.8 {d0}, [r1]!
+ subs r2, #8
+ vst1.8 {d0}, [r0,:64]!
+ bhs 0b
+L_ascendingUnalignedVectorCleanup:
+ adds r2, #0x18
+ blo L_ascendingUnalignedByteCleanup
+0: vld1.8 {d0}, [r1]!
+ subs r2, #8
+ vst1.8 {d0}, [r0,:64]!
+ bhs 0b
+ b L_ascendingUnalignedByteCleanup
+
+L_ascendingUnalignedCachelineCopy:
+// Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
+// of the source address.
+ vld1.8 {q0,q1},[r1]!
+ pld [r1, #0x60]
+ vst1.8 {q0,q1},[r0,:256]!
+ subs r2, #0x20
+ bhs L_ascendingUnalignedCachelineCopy
+ b L_ascendingUnalignedVectorCleanup
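+// vld1.8/vst1.8 operate on byte elements, so the loads tolerate any source
+// alignment, while the ":64" and ":256" annotations assert 8- and 32-byte
+// destination alignment to the hardware.  An illustrative C sketch of the
+// shape of this path (the real code keeps the data in q0/q1):
+//
+//     while (len >= 32) {                    // cacheline loop
+//         memcpy(dst, src, 32);
+//         __builtin_prefetch(src + 0x60);
+//         dst += 32; src += 32; len -= 32;
+//     }
+//     while (len >= 8) {                     // vector cleanup
+//         memcpy(dst, src, 8);
+//         dst += 8; src += 8; len -= 8;
+//     }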
+
+/*****************************************************************************
+ * descending copy                                                           *
+ *****************************************************************************/
+
+// The layout of the two buffers is such that we must copy in descending-
+// address order. Throughout this copy, registers are used as follows:
+//
+// r0 lowest address in the destination buffer that has been written to.
+// r1 lowest address in the source buffer that has been read from.
+// r2 number of bytes remaining to copy less an offset that varies
+// with the size of the copies that are being made.
+// r3, r4, r5, r6, r8, r9, r10, r12
+// temporary registers used to hold the data during copies.
+// r12 also used as a scratch register for alignment / length calculations
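+//
+// The descending path mirrors the ascending one: both pointers are first
+// advanced to one past the end of their buffers, and every access then
+// pre-decrements.  The overall shape, as an illustrative C sketch:
+//
+//     unsigned char *d = (unsigned char *)dst + n;
+//     const unsigned char *s = (const unsigned char *)src + n;
+//     while (n--)
+//         *--d = *--s;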
+
+L_descendingCopy:
+// We begin by checking if less than four bytes are to be copied; if so, we
+// branch directly to a small-buffer copy and return. Otherwise, we copy up
+// to three bytes if needed to make the destination pointer have word (four
+// byte) alignment.
+ add r1, r2
+ add r0, r2
+ subs r2, #4
+ blo L_descendingLengthLessThanFour
+ ands ip, r0, #0x3
+ beq L_descendingDestinationWordAligned
+ ldrb r3, [r1, #-1]!
+ cmp ip, #2
+ ldrbhs r4, [r1, #-1]!
+ strb r3, [r0, #-1]!
+ ldrbhi r3, [r1, #-1]!
+ strbhs r4, [r0, #-1]!
+ strbhi r3, [r0, #-1]!
+ subs r2, ip
+ bhs L_descendingDestinationWordAligned
+
+L_descendingLengthLessThanFour:
+// Conditionally copies up to three bytes, making no alignment assumptions.
+// Reached when the original length is smaller than four, or when fewer than
+// four bytes remain after the alignment copy above.
+ lsls ip, r2, #31
+ ldrbcs r3, [r1, #-1]!
+ ldrbcs ip, [r1, #-1]!
+ ldrbmi r4, [r1, #-1]
+ strbcs r3, [r0, #-1]!
+ strbcs ip, [r0, #-1]!
+ strbmi r4, [r0, #-1]
+ CLEAR_FRAME_AND_RETURN
+
+L_descendingDestinationWordAligned:
+// We know that the destination has word alignment. If the source is not
+// similarly aligned, jump to an unaligned copy loop.
+ tst r1, #0x3
+ bne L_descendingUnalignedCopy
+
+/*****************************************************************************
+ * descending copy, both buffers have word alignment                         *
+ *****************************************************************************/
+
+// If less than sixty-four bytes remain to be copied, jump directly to the
+// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
+// to make the destination pointer have cacheline alignment.
+ subs r2, r2, #0x3c
+ blo L_descendingLengthLessThanSixtyFour
+0: tst r0, #0x1c
+ beq L_descendingDestinationCachelineAligned
+ ldr r3, [r1, #-4]!
+ subs r2, #4
+ str r3, [r0, #-4]!
+ bhs 0b
+ b L_descendingLengthLessThanSixtyFour
+
+L_descendingDestinationCachelineAligned:
+// Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
+// Empirical testing suggests that -0x80 is the optimal lookahead for preload,
+// though anything between -0x40 and -0x100 seems to be "acceptable".
+ push ADDITIONAL_CALLEE_SAVE_REGISTERS
+0: ldmdb r1!, COPY_REGISTERS
+ subs r2, r2, #0x40
+ stmdb r0!, COPY_REGISTERS
+ pld [r1, #-0x80]
+ ldmdb r1!, COPY_REGISTERS
+ pld [r1, #-0x80]
+ stmdb r0!, COPY_REGISTERS
+ bhs 0b
+ pop ADDITIONAL_CALLEE_SAVE_REGISTERS
+
+L_descendingLengthLessThanSixtyFour:
+// Cleanup copy of up to 63 bytes. We can assume that both the source and
+// destination addresses have word alignment here.
+ tst r2, #0x30
+ beq 1f
+0: ldmdb r1!, {r3,r4,r9,ip}
+ sub r2, r2, #0x10
+ stmdb r0!, {r3,r4,r9,ip}
+ tst r2, #0x30
+ bne 0b
+1: tst r2, #0xf
+ beq 2f
+ lsls ip, r2, #29
+ ldmdbcs r1!, {r3,ip}
+ stmdbcs r0!, {r3,ip}
+ ldrmi r3, [r1, #-4]!
+ strmi r3, [r0, #-4]!
+ lsls ip, r2, #31
+ ldrhcs r3, [r1, #-2]!
+ strhcs r3, [r0, #-2]!
+ ldrbmi ip, [r1, #-1]
+ strbmi ip, [r0, #-1]
+2: CLEAR_FRAME_AND_RETURN
+
+/*****************************************************************************
+ * descending copy, source buffer is not word aligned                        *
+ *****************************************************************************/
+
+L_descendingUnalignedCopy:
+// Destination buffer is word aligned, but source buffer is not. Copy
+// byte-by-byte until the destination buffer has eight-byte alignment.
+ subs r2, #4
+ blo L_descendingUnalignedByteCleanup
+0: tst r0, #0x7
+ beq L_descendingUnalignedVectorCopy
+ ldrb r3, [r1, #-1]!
+ subs r2, #1
+ strb r3, [r0, #-1]!
+ bhs 0b
+L_descendingUnalignedByteCleanup:
+ adds r2, #8
+ beq 1f
+0: ldrb r3, [r1, #-1]!
+ subs r2, #1
+ strb r3, [r0, #-1]!
+ bne 0b
+1: CLEAR_FRAME_AND_RETURN
+
+L_descendingUnalignedVectorCopy:
+// Destination buffer is eight-byte aligned. Source buffer has unknown
+// alignment. Use NEON to handle the misaligned copies. We begin by copying
+// up to 24 bytes to get cacheline alignment of the destination buffer.
+ subs r2, #0x18
+ blo L_descendingUnalignedVectorCleanup
+0: tst r0, #0x18
+ beq L_descendingUnalignedCachelineCopy
+ sub r1, #8
+ vld1.8 {d0}, [r1]
+ sub r0, #8
+ vst1.8 {d0}, [r0,:64]
+ subs r2, #8
+ bhs 0b
+L_descendingUnalignedVectorCleanup:
+ adds r2, #0x18
+ blo L_descendingUnalignedByteCleanup
+0: sub r1, #8
+ vld1.8 {d0}, [r1]
+ sub r0, #8
+ vst1.8 {d0}, [r0,:64]
+ subs r2, #8
+ bhs 0b
+ b L_descendingUnalignedByteCleanup
+
+L_descendingUnalignedCachelineCopy:
+// Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
+// of the source address.
+ sub r1, #32
+ sub r0, #32
+ mov r4, #-32
+0: vld1.8 {q0,q1},[r1], r4
+ pld [r1, #-0x60]
+ vst1.8 {q0,q1},[r0,:256], r4
+ subs r2, #0x20
+ bhs 0b
+ add r1, #32
+ add r0, #32
+ b L_descendingUnalignedVectorCleanup
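+// vld1/vst1 with a register offset post-index the address by that register,
+// so with r4 holding -32 both pointers step down one 32-byte block per
+// iteration; the sub/add pairs around the loop convert between
+// one-past-the-block and start-of-block addressing.  Illustrative C sketch:
+//
+//     d -= 32; s -= 32;                  // point at the highest block
+//     while (len >= 32) {                // the asm keeps a biased count
+//         memcpy(d, s, 32);
+//         d -= 32; s -= 32;              // the post-index by r4 == -32
+//         __builtin_prefetch(s - 0x60);  // roughly pld [r1, #-0x60]
+//         len -= 32;
+//     }
+//     d += 32; s += 32;                  // undo the final post-decrement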
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD