+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+/*****************************************************************************
+ * Macros                                                                    *
+ *****************************************************************************/
+
+#define A9_ENTRY(name) \
+ .align 2;\
+ .globl _ ## name ## $VARIANT$CortexA9;\
+ _ ## name ## $VARIANT$CortexA9:
+
+#define ESTABLISH_FRAME \
+ push {r0,r4,r7,lr};\
+ add r7, sp, #8
+
+#define CLEAR_FRAME_AND_RETURN \
+ pop {r0,r4,r7,pc}
+
+#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}
+
+#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}
+
+/*****************************************************************************
+ * entry points                                                              *
+ *****************************************************************************/
+
+.text
+.syntax unified
+.code 32
+
+A9_ENTRY(bcopy)
+// Translate bcopy calls into memcpy calls by swapping the first and second
+// arguments.
+ mov r3, r0
+ mov r0, r1
+ mov r1, r3
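+// In C terms this is just the argument-order difference between the two
+// interfaces (an illustrative sketch, not part of the build):
+//
+//     void bcopy(const void *src, void *dst, size_t n) {
+//         memcpy(dst, src, n);
+//     }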
+
+A9_ENTRY(memcpy)
+A9_ENTRY(memmove)
+// Our preference is to copy the data in ascending address order, but if the
+// buffers overlap such that the beginning of the destination buffer aliases
+// the end of the source buffer, we need to copy in descending address order
+// instead to preserve the memmove semantics. We detect this case with the
+// test:
+//
+// destination - source < length (unsigned compare)
+//
+// If the address of the source buffer is higher than the address of the
+// destination buffer, this arithmetic can overflow, but the overflowed value
+// can only be smaller than length if the buffers do not overlap, so we don't
+// need to worry about false positives due to the overflow (they happen, but
+// only in cases where copying in either order is correct).
+ subs r3, r0, r1
+ bxeq lr
+ ESTABLISH_FRAME
+ cmp r3, r2
+ blo L_descendingCopy
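+// A rough C sketch of the dispatch above (illustrative only; copy_ascending
+// and copy_descending are hypothetical names for the two code paths below):
+//
+//     void *memmove_sketch(void *dst, const void *src, size_t n) {
+//         if (dst == src)
+//             return dst;                           // the bxeq lr above
+//         if ((uintptr_t)dst - (uintptr_t)src < n)  // unsigned compare
+//             copy_descending(dst, src, n);
+//         else
+//             copy_ascending(dst, src, n);
+//         return dst;
+//     }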
+
+/*****************************************************************************
+ * ascending copy                                                            *
+ *****************************************************************************/
+
+// The layout of the two buffers is such that we can use our preferred
+// (ascending address order) copy implementation. Throughout this copy,
+// registers are used as follows:
+//
+// r0 lowest unwritten address in the destination buffer.
+// r1 lowest unread address in the source buffer.
+// r2 number of bytes remaining to copy less an offset that varies
+// with the size of the copies that are being made.
+// r3, r4, r5, r6, r8, r9, r10, r12
+// temporary registers used to hold the data during copies.
+// r12 also used as a scratch register for alignment / length calculations
+
+L_ascendingCopy:
+// We begin by checking if less than four bytes are to be copied; if so, we
+// branch directly to a small-buffer copy and return. Otherwise, we copy up
+// to three bytes if needed to make the destination pointer have word (four
+// byte) alignment.
+ subs r2, #4
+ blo L_ascendingLengthLessThanFour
+ ands ip, r0, #0x3
+ beq L_ascendingDestinationWordAligned
+ ldrb r3, [r1],#1
+ cmp ip, #2
+ ldrbls r4, [r1],#1
+ strb r3, [r0],#1
+ ldrblo r3, [r1],#1
+ add r2, ip
+ strbls r4, [r0],#1
+ strblo r3, [r0],#1
+ subs r2, #4
+ bhs L_ascendingDestinationWordAligned
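+// The add deliberately does not set flags, so the strbls/strblo conditions
+// still come from the cmp above.  The ladder copies exactly 4 - (dst & 3)
+// bytes; an illustrative C sketch, with dst/src/len standing in for r0/r1
+// and the remaining length:
+//
+//     size_t head = 4 - ((uintptr_t)dst & 3);  // 1, 2, or 3 here
+//     len -= head;
+//     while (head--)
+//         *dst++ = *src++;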
+
+L_ascendingLengthLessThanFour:
+// Conditionally copies up to three bytes, making no alignment assumptions.
+// Reached when the original length is smaller than four, or when fewer than
+// four bytes remain after the alignment copy above.
+ lsls ip, r2, #31
+ ldrbcs r3, [r1],#1
+ ldrbcs ip, [r1],#1
+ ldrbmi r4, [r1]
+ strbcs r3, [r0],#1
+ strbcs ip, [r0],#1
+ strbmi r4, [r0]
+ CLEAR_FRAME_AND_RETURN
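+// The lsls shifts bit 1 of r2 into the carry flag and bit 0 into the sign
+// bit of the result (the N flag); the low two bits of r2 match those of the
+// remaining length, so the "cs" pair copies a two-byte chunk and the "mi"
+// pair copies the final byte.  Illustrative C sketch:
+//
+//     if (len & 2) { *dst++ = *src++; *dst++ = *src++; }
+//     if (len & 1) { *dst = *src; }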
+
+L_ascendingDestinationWordAligned:
+// We know that the destination has word alignment. If the source is not
+// similarly aligned, jump to an unaligned copy loop.
+ tst r1, #0x3
+ bne L_ascendingUnalignedCopy
+
+/*****************************************************************************
+ * ascending copy, both buffers have word alignment                          *
+ *****************************************************************************/
+
+// If less than sixty-four bytes remain to be copied, jump directly to the
+// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
+// to make the destination pointer have cacheline alignment.
+ subs r2, r2, #0x3c
+ blo L_ascendingLengthLessThanSixtyFour
+0: tst r0, #0x1c
+ beq L_ascendingDestinationCachelineAligned
+ ldr r3, [r1],#4
+ subs r2, #4
+ str r3, [r0],#4
+ bhs 0b
+ b L_ascendingLengthLessThanSixtyFour
+
+L_ascendingDestinationCachelineAligned:
+// Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
+// Empirical testing suggests that 0x60 is the optimal lookahead for preload,
+// though anything between 0x40 and 0x100 seems to be "acceptable".
+ push ADDITIONAL_CALLEE_SAVE_REGISTERS
+0: ldm r1!, COPY_REGISTERS
+ subs r2, r2, #0x40
+ stm r0!, COPY_REGISTERS
+ pld [r1, #0x60]
+ ldm r1!, COPY_REGISTERS
+ pld [r1, #0x60]
+ stm r0!, COPY_REGISTERS
+ bhs 0b
+ pop ADDITIONAL_CALLEE_SAVE_REGISTERS
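+// Neither stm nor pld sets flags, so the bhs still tests the subs at the
+// top of the loop.  An illustrative C sketch of the loop, with dst/src as
+// uint32_t pointers and __builtin_prefetch standing in for pld:
+//
+//     while (len >= 64) {
+//         __builtin_prefetch(src + 24);  // 24 words = 0x60 bytes ahead
+//         for (int i = 0; i < 16; ++i)   // 16 words = two 32-byte cachelines
+//             dst[i] = src[i];
+//         dst += 16; src += 16; len -= 64;
+//     }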
+
+L_ascendingLengthLessThanSixtyFour:
+// Cleanup copy of up to 63 bytes. We can assume that both the source and
+// destination addresses have word alignment here.
+ tst r2, #0x30
+ beq 1f
+0: ldm r1!, {r3,r4,r9,ip}
+ sub r2, r2, #0x10
+ stm r0!, {r3,r4,r9,ip}
+ tst r2, #0x30
+ bne 0b
+1: tst r2, #0xf
+ beq 2f
+ lsls ip, r2, #29
+ ldmcs r1!, {r3,ip}
+ stmcs r0!, {r3,ip}
+ ldrmi r3, [r1],#4
+ strmi r3, [r0],#4
+ lsls ip, r2, #31
+ ldrhcs r3, [r1],#2
+ strhcs r3, [r0],#2
+ ldrbmi ip, [r1]
+ strbmi ip, [r0]
+2: CLEAR_FRAME_AND_RETURN
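+// The 16-byte loop peels off bits 4 and 5 of the remaining count; the same
+// flag trick as the small-copy path then finishes up: lsls #29 moves bit 3
+// into carry and bit 2 into N (8- and 4-byte copies), and lsls #31 handles
+// bits 1 and 0 (halfword and byte).  Illustrative C sketch, with d/s as
+// byte pointers:
+//
+//     while (len & 0x30) { memcpy(d, s, 16); d += 16; s += 16; len -= 16; }
+//     if (len & 8) { memcpy(d, s, 8); d += 8; s += 8; }
+//     if (len & 4) { memcpy(d, s, 4); d += 4; s += 4; }
+//     if (len & 2) { memcpy(d, s, 2); d += 2; s += 2; }
+//     if (len & 1) { *d = *s; }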
+
+/*****************************************************************************
+ * ascending copy, source buffer is not word aligned                         *
+ *****************************************************************************/
+
+L_ascendingUnalignedCopy:
+// Destination buffer is word aligned, but source buffer is not. Copy
+// byte-by-byte until the destination buffer has eight-byte alignment.
+ subs r2, #4
+ blo L_ascendingUnalignedByteCleanup
+0: tst r0, #0x7
+ beq L_ascendingUnalignedVectorCopy
+ ldrb r3, [r1],#1
+ subs r2, #1
+ strb r3, [r0],#1
+ bhs 0b
+L_ascendingUnalignedByteCleanup:
+ adds r2, #8
+ beq 1f
+0: ldrb r3, [r1],#1
+ subs r2, #1
+ strb r3, [r0],#1
+ bne 0b
+1: CLEAR_FRAME_AND_RETURN
+
+L_ascendingUnalignedVectorCopy:
+// Destination buffer is eight-byte aligned. Source buffer has unknown
+// alignment. Use NEON to handle the misaligned copies. We begin by copying
+// up to 24 bytes to get cacheline alignment of the destination buffer.
+ subs r2, #0x18
+ blo L_ascendingUnalignedVectorCleanup
+0: tst r0, #0x18
+ beq L_ascendingUnalignedCachelineCopy
+ vld1.8 {d0}, [r1]!
+ subs r2, #8
+ vst1.8 {d0}, [r0,:64]!
+ bhs 0b
+L_ascendingUnalignedVectorCleanup:
+ adds r2, #0x18
+ blo L_ascendingUnalignedByteCleanup
+0: vld1.8 {d0}, [r1]!
+ subs r2, #8
+ vst1.8 {d0}, [r0,:64]!
+ bhs 0b
+ b L_ascendingUnalignedByteCleanup
+
+L_ascendingUnalignedCachelineCopy:
+// Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
+// of the source address.
+ vld1.8 {q0,q1},[r1]!
+ pld [r1, #0x60]
+ vst1.8 {q0,q1},[r0,:256]!
+ subs r2, #0x20
+ bhs L_ascendingUnalignedCachelineCopy
+ b L_ascendingUnalignedVectorCleanup
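+// vld1.8/vst1.8 operate on byte elements, so the loads tolerate any source
+// alignment, while the ":64" and ":256" annotations assert 8- and 32-byte
+// destination alignment to the hardware.  An illustrative C sketch of the
+// shape of this path (the real code keeps the data in q0/q1):
+//
+//     while (len >= 32) {                    // cacheline loop
+//         memcpy(dst, src, 32);
+//         __builtin_prefetch(src + 0x60);
+//         dst += 32; src += 32; len -= 32;
+//     }
+//     while (len >= 8) {                     // vector cleanup
+//         memcpy(dst, src, 8);
+//         dst += 8; src += 8; len -= 8;
+//     }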
+
+/*****************************************************************************
+ * descending copy                                                           *
+ *****************************************************************************/
+
+// The layout of the two buffers is such that we must copy in descending-
+// address order. Throughout this copy, registers are used as follows:
+//
+// r0 lowest address in the destination buffer that has been written to.
+// r1 lowest address in the source buffer that has been read from.
+// r2 number of bytes remaining to copy less an offset that varies
+// with the size of the copies that are being made.
+// r3, r4, r5, r6, r8, r9, r10, r12
+// temporary registers used to hold the data during copies.
+// r12 also used as a scratch register for alignment / length calculations
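+//
+// The descending path mirrors the ascending one: both pointers are first
+// advanced to one past the end of their buffers, and every access then
+// pre-decrements.  The overall shape, as an illustrative C sketch:
+//
+//     unsigned char *d = (unsigned char *)dst + n;
+//     const unsigned char *s = (const unsigned char *)src + n;
+//     while (n--)
+//         *--d = *--s;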
+
+L_descendingCopy:
+// We begin by checking if less than four bytes are to be copied; if so, we
+// branch directly to a small-buffer copy and return. Otherwise, we copy up
+// to three bytes if needed to make the destination pointer have word (four
+// byte) alignment.
+ add r1, r2
+ add r0, r2
+ subs r2, #4
+ blo L_descendingLengthLessThanFour
+ ands ip, r0, #0x3
+ beq L_descendingDestinationWordAligned
+ ldrb r3, [r1, #-1]!
+ cmp ip, #2
+ ldrbhs r4, [r1, #-1]!
+ strb r3, [r0, #-1]!
+ ldrbhi r3, [r1, #-1]!
+ strbhs r4, [r0, #-1]!
+ strbhi r3, [r0, #-1]!
+ subs r2, ip
+ bhs L_descendingDestinationWordAligned
+
+L_descendingLengthLessThanFour:
+// Conditionally copies up to three bytes, making no alignment assumptions.
+// Reached when the original length is smaller than four, or when fewer than
+// four bytes remain after the alignment copy above.
+ lsls ip, r2, #31
+ ldrbcs r3, [r1, #-1]!
+ ldrbcs ip, [r1, #-1]!
+ ldrbmi r4, [r1, #-1]
+ strbcs r3, [r0, #-1]!
+ strbcs ip, [r0, #-1]!
+ strbmi r4, [r0, #-1]
+ CLEAR_FRAME_AND_RETURN
+
+L_descendingDestinationWordAligned:
+// We know that the destination has word alignment. If the source is not
+// similarly aligned, jump to an unaligned copy loop.
+ tst r1, #0x3
+ bne L_descendingUnalignedCopy
+
+/*****************************************************************************
+ * descending copy, both buffers have word alignment                         *
+ *****************************************************************************/
+
+// If less than sixty-four bytes remain to be copied, jump directly to the
+// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
+// to make the destination pointer have cacheline alignment.
+ subs r2, r2, #0x3c
+ blo L_descendingLengthLessThanSixtyFour
+0: tst r0, #0x1c
+ beq L_descendingDestinationCachelineAligned
+ ldr r3, [r1, #-4]!
+ subs r2, #4
+ str r3, [r0, #-4]!
+ bhs 0b
+ b L_descendingLengthLessThanSixtyFour
+
+L_descendingDestinationCachelineAligned:
+// Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
+// Empirical testing suggests that -0x80 is the optimal lookahead for preload,
+// though anything between -0x40 and -0x100 seems to be "acceptable".
+ push ADDITIONAL_CALLEE_SAVE_REGISTERS
+0: ldmdb r1!, COPY_REGISTERS
+ subs r2, r2, #0x40
+ stmdb r0!, COPY_REGISTERS
+ pld [r1, #-0x80]
+ ldmdb r1!, COPY_REGISTERS
+ pld [r1, #-0x80]
+ stmdb r0!, COPY_REGISTERS
+ bhs 0b
+ pop ADDITIONAL_CALLEE_SAVE_REGISTERS
+
+L_descendingLengthLessThanSixtyFour:
+// Cleanup copy of up to 63 bytes. We can assume that both the source and
+// destination addresses have word alignment here.
+ tst r2, #0x30
+ beq 1f
+0: ldmdb r1!, {r3,r4,r9,ip}
+ sub r2, r2, #0x10
+ stmdb r0!, {r3,r4,r9,ip}
+ tst r2, #0x30
+ bne 0b
+1: tst r2, #0xf
+ beq 2f
+ lsls ip, r2, #29
+ ldmdbcs r1!, {r3,ip}
+ stmdbcs r0!, {r3,ip}
+ ldrmi r3, [r1, #-4]!
+ strmi r3, [r0, #-4]!
+ lsls ip, r2, #31
+ ldrhcs r3, [r1, #-2]!
+ strhcs r3, [r0, #-2]!
+ ldrbmi ip, [r1, #-1]
+ strbmi ip, [r0, #-1]
+2: CLEAR_FRAME_AND_RETURN
+
+/*****************************************************************************
+ * descending copy, source buffer is not word aligned                        *
+ *****************************************************************************/
+
+L_descendingUnalignedCopy:
+// Destination buffer is word aligned, but source buffer is not. Copy
+// byte-by-byte until the destination buffer has eight-byte alignment.
+ subs r2, #4
+ blo L_descendingUnalignedByteCleanup
+0: tst r0, #0x7
+ beq L_descendingUnalignedVectorCopy
+ ldrb r3, [r1, #-1]!
+ subs r2, #1
+ strb r3, [r0, #-1]!
+ bhs 0b
+L_descendingUnalignedByteCleanup:
+ adds r2, #8
+ beq 1f
+0: ldrb r3, [r1, #-1]!
+ subs r2, #1
+ strb r3, [r0, #-1]!
+ bne 0b
+1: CLEAR_FRAME_AND_RETURN
+
+L_descendingUnalignedVectorCopy:
+// Destination buffer is eight-byte aligned. Source buffer has unknown
+// alignment. Use NEON to handle the misaligned copies. We begin by copying
+// up to 24 bytes to get cacheline alignment of the destination buffer.
+ subs r2, #0x18
+ blo L_descendingUnalignedVectorCleanup
+0: tst r0, #0x18
+ beq L_descendingUnalignedCachelineCopy
+ sub r1, #8
+ vld1.8 {d0}, [r1]
+ sub r0, #8
+ vst1.8 {d0}, [r0,:64]
+ subs r2, #8
+ bhs 0b
+L_descendingUnalignedVectorCleanup:
+ adds r2, #0x18
+ blo L_descendingUnalignedByteCleanup
+0: sub r1, #8
+ vld1.8 {d0}, [r1]
+ sub r0, #8
+ vst1.8 {d0}, [r0,:64]
+ subs r2, #8
+ bhs 0b
+ b L_descendingUnalignedByteCleanup
+
+L_descendingUnalignedCachelineCopy:
+// Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
+// of the source address.
+ sub r1, #32
+ sub r0, #32
+ mov r4, #-32
+0: vld1.8 {q0,q1},[r1], r4
+ pld [r1, #-0x60]
+ vst1.8 {q0,q1},[r0,:256], r4
+ subs r2, #0x20
+ bhs 0b
+ add r1, #32
+ add r0, #32
+ b L_descendingUnalignedVectorCleanup
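+// vld1/vst1 with a register offset post-index the address by that register,
+// so with r4 holding -32 both pointers step down one 32-byte block per
+// iteration; the sub/add pairs around the loop convert between
+// one-past-the-block and start-of-block addressing.  Illustrative C sketch:
+//
+//     d -= 32; s -= 32;                  // point at the highest block
+//     while (len >= 32) {                // the asm keeps a biased count
+//         memcpy(d, s, 32);
+//         d -= 32; s -= 32;              // the post-index by r4 == -32
+//         __builtin_prefetch(s - 0x60);  // roughly pld [r1, #-0x60]
+//         len -= 32;
+//     }
+//     d += 32; s += 32;                  // undo the final post-decrement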
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD