/*
 * Copyright (c) 2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Cortex-A9 processor:
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All three copy n successive bytes from source to destination. memmove and
 * memcpy return destination, whereas bcopy has no return value. Copying takes
 * place as if it were through a temporary buffer -- after return, destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on OS X and iOS).
 */

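//  Illustrative usage from C (a sketch of the caller's view, not part of
//  this file):
//
//      char buf[8] = "abcdefg";
//      memmove(buf + 1, buf, 6);    // overlapping copy; buf is now "aabcdef"
//      bcopy(buf, buf + 1, 6);      // same semantics, source argument first
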
#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

#define A9_ENTRY(name) \
    .align 2;\
    .globl _ ## name ## $VARIANT$CortexA9;\
    _ ## name ## $VARIANT$CortexA9:

#define ESTABLISH_FRAME \
    push    {r0,r4,r7,lr};\
    add     r7,     sp, #8

#define CLEAR_FRAME_AND_RETURN \
    pop     {r0,r4,r7,pc}
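
//  Note: ESTABLISH_FRAME saves r0 -- the destination pointer, which is also
//  the return value of memcpy and memmove -- along with r4, r7, and lr, and
//  points r7 at the saved frame pointer slot to maintain the frame chain.
//  CLEAR_FRAME_AND_RETURN pops the original destination back into r0 and
//  returns by popping lr into pc, so every exit path restores the correct
//  return value with no additional instructions.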

#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.text
.syntax unified
.code 32

A9_ENTRY(bcopy)
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments.
    mov     r3,     r0
    mov     r0,     r1
    mov     r1,     r3

A9_ENTRY(memcpy)
A9_ENTRY(memmove)
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics. We detect this case with the
//  test:
//
//      destination - source < length   (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
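//
//  Worked example: with destination 0x1000, source 0x2000, length 0x100,
//  destination - source wraps to 0xfffff000, which is not below 0x100, so the
//  ascending copy runs (safe, since the source lies entirely above the
//  destination). With destination 0x1080, source 0x1000, length 0x100, the
//  difference is 0x80 < 0x100, so the descending copy is chosen because the
//  start of the destination aliases the tail of the source.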
    subs    r3,     r0, r1
    bxeq    lr
    ESTABLISH_FRAME
    cmp     r3,     r2
    blo     L_descendingCopy

/*****************************************************************************
 *  ascending copy                                                           *
 *****************************************************************************/

//  The layout of the two buffers is such that we can use our preferred
//  (ascending address order) copy implementation. Throughout this copy,
//  registers are used as follows:
//
//      r0      lowest unwritten address in the destination buffer.
//      r1      lowest unread address in the source buffer.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_ascendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
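//
//  The cmp against 2 steers the conditional byte copies: one byte always
//  moves, a second when (destination & 3) <= 2, and a third when
//  (destination & 3) == 1, for a total of 4 - (destination & 3) bytes. The
//  add of ip followed by the second subs leaves r2 once again holding the
//  number of bytes remaining to copy, less four.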
    subs    r2,     #4
    blo     L_ascendingLengthLessThanFour
    ands    ip,     r0, #0x3
    beq     L_ascendingDestinationWordAligned
    ldrb    r3,    [r1],#1
    cmp     ip,     #2
    ldrbls  r4,    [r1],#1
    strb    r3,    [r0],#1
    ldrblo  r3,    [r1],#1
    add     r2,     ip
    strbls  r4,    [r0],#1
    strblo  r3,    [r0],#1
    subs    r2,     #4
    bhs     L_ascendingDestinationWordAligned

L_ascendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This path
//  is used when fewer than four bytes remain to be copied: either the
//  original length was smaller than four, or only a few bytes were left
//  after the alignment copy above.
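//
//  r2 holds (bytes remaining - 4) here, but subtracting four does not
//  disturb the low two bits. The lsls shifts bit 1 of r2 into the carry flag
//  and bit 0 into the sign flag, so the CS-conditional pair copies two bytes
//  and the MI-conditional load/store copies one, covering any residue of
//  zero to three bytes without further branches.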
    lsls    ip,     r2, #31
    ldrbcs  r3,    [r1],#1
    ldrbcs  ip,    [r1],#1
    ldrbmi  r4,    [r1]
    strbcs  r3,    [r0],#1
    strbcs  ip,    [r0],#1
    strbmi  r4,    [r0]
    CLEAR_FRAME_AND_RETURN

L_ascendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1,     #0x3
    bne     L_ascendingUnalignedCopy

/*****************************************************************************
 *  ascending copy, both buffers have word alignment                         *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
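//
//  r2 already holds (bytes remaining - 4) on entry here, so subtracting
//  0x3c leaves it at (bytes remaining - 0x40); the loop below keeps that
//  invariant while word-copying until the destination reaches a 32-byte
//  (cacheline) boundary.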
    subs    r2,     r2, #0x3c
    blo     L_ascendingLengthLessThanSixtyFour
0:  tst     r0,     #0x1c
    beq     L_ascendingDestinationCachelineAligned
    ldr     r3,    [r1],#4
    subs    r2,     #4
    str     r3,    [r0],#4
    bhs     0b
    b       L_ascendingLengthLessThanSixtyFour

L_ascendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that 0x60 is the optimal lookahead for preload,
//  though anything between 0x40 and 0x100 seems to be "acceptable".
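//
//  COPY_REGISTERS names eight registers, so each ldm/stm pair below moves 32
//  bytes (one cacheline) and two pairs move the full 64 bytes per iteration;
//  r5, r6, r8, and r10 are pushed here only so the register list can be that
//  wide. None of the pld/ldm/stm instructions set flags, so the subs result
//  survives to the bhs at the bottom of the loop.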
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldm     r1!,    COPY_REGISTERS
    subs    r2,     r2, #0x40
    stm     r0!,    COPY_REGISTERS
    pld     [r1, #0x60]
    ldm     r1!,    COPY_REGISTERS
    pld     [r1, #0x60]
    stm     r0!,    COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_ascendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here.
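//
//  r2 holds (bytes remaining - 0x40) at this point; that subtraction only
//  borrows from bit 6 upward, so testing #0x30 and #0xf against r2 still
//  reads the true remainder below 64.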
    tst     r2,     #0x30
    beq     1f
0:  ldm     r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stm     r0!,   {r3,r4,r9,ip}
    tst     r2,     #0x30
    bne     0b
1:  tst     r2,     #0xf
    beq     2f
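//  Same flag trick as the short-copy path, two bits at a time: lsls #29 puts
//  bit 3 of the remainder in the carry flag and bit 2 in the sign flag,
//  copying eight then four bytes; lsls #31 does the same for bits 1 and 0,
//  copying a halfword then a byte. Any residue of one to fifteen bytes is
//  thus handled without branching.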
    lsls    ip,     r2, #29
    ldmcs   r1!,   {r3,ip}
    stmcs   r0!,   {r3,ip}
    ldrmi   r3,    [r1],#4
    strmi   r3,    [r0],#4
    lsls    ip,     r2, #31
    ldrhcs  r3,    [r1],#2
    strhcs  r3,    [r0],#2
    ldrbmi  ip,    [r1]
    strbmi  ip,    [r0]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  ascending copy, source buffer is not word aligned                        *
 *****************************************************************************/

L_ascendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2,     #4
    blo     L_ascendingUnalignedByteCleanup
0:  tst     r0,     #0x7
    beq     L_ascendingUnalignedVectorCopy
    ldrb    r3,    [r1],#1
    subs    r2,     #1
    strb    r3,    [r0],#1
    bhs     0b
L_ascendingUnalignedByteCleanup:
    adds    r2,     #8
    beq     1f
0:  ldrb    r3,    [r1],#1
    subs    r2,     #1
    strb    r3,    [r0],#1
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_ascendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
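//
//  The :64 and :256 qualifiers on the vst1.8 instructions assert 64-bit and
//  256-bit alignment of the destination address, which the preceding
//  alignment loops guarantee; the vld1.8 loads carry no qualifier because
//  the source may have arbitrary alignment.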
    subs    r2,     #0x18
    blo     L_ascendingUnalignedVectorCleanup
0:  tst     r0,     #0x18
    beq     L_ascendingUnalignedCachelineCopy
    vld1.8  {d0},  [r1]!
    subs    r2,     #8
    vst1.8  {d0},  [r0,:64]!
    bhs     0b
L_ascendingUnalignedVectorCleanup:
    adds    r2,     #0x18
    blo     L_ascendingUnalignedByteCleanup
0:  vld1.8  {d0},  [r1]!
    subs    r2,     #8
    vst1.8  {d0},  [r0,:64]!
    bhs     0b
    b       L_ascendingUnalignedByteCleanup

L_ascendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address.
    vld1.8  {q0,q1},[r1]!
    pld     [r1, #0x60]
    vst1.8  {q0,q1},[r0,:256]!
    subs    r2,     #0x20
    bhs     L_ascendingUnalignedCachelineCopy
    b       L_ascendingUnalignedVectorCleanup

/*****************************************************************************
 *  descending copy                                                          *
 *****************************************************************************/

//  The layout of the two buffers is such that we must copy in descending-
//  address order. Throughout this copy, registers are used as follows:
//
//      r0      lowest address in the destination buffer that has been
//              written to.
//      r1      lowest address in the source buffer that has been read from.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_descendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
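//
//  The two adds move r0 and r1 to one past the ends of their buffers; all of
//  the descending loops then step downward with pre-decrement addressing
//  ([rN, #-k]! or the db variants of ldm/stm).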
    add     r1,     r2
    add     r0,     r2
    subs    r2,     #4
    blo     L_descendingLengthLessThanFour
    ands    ip,     r0, #0x3
    beq     L_descendingDestinationWordAligned
    ldrb    r3,    [r1, #-1]!
    cmp     ip,     #2
    ldrbhs  r4,    [r1, #-1]!
    strb    r3,    [r0, #-1]!
    ldrbhi  r3,    [r1, #-1]!
    strbhs  r4,    [r0, #-1]!
    strbhi  r3,    [r0, #-1]!
    subs    r2,     ip
    bhs     L_descendingDestinationWordAligned

L_descendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This path
//  is used when fewer than four bytes remain to be copied: either the
//  original length was smaller than four, or only a few bytes were left
//  after the alignment copy above.
    lsls    ip,     r2, #31
    ldrbcs  r3,    [r1, #-1]!
    ldrbcs  ip,    [r1, #-1]!
    ldrbmi  r4,    [r1, #-1]
    strbcs  r3,    [r0, #-1]!
    strbcs  ip,    [r0, #-1]!
    strbmi  r4,    [r0, #-1]
    CLEAR_FRAME_AND_RETURN

L_descendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1,     #0x3
    bne     L_descendingUnalignedCopy

/*****************************************************************************
 *  descending copy, both buffers have word alignment                        *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
    subs    r2,     r2, #0x3c
    blo     L_descendingLengthLessThanSixtyFour
0:  tst     r0,     #0x1c
    beq     L_descendingDestinationCachelineAligned
    ldr     r3,    [r1, #-4]!
    subs    r2,     #4
    str     r3,    [r0, #-4]!
    bhs     0b
    b       L_descendingLengthLessThanSixtyFour

L_descendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that -0x80 is the optimal lookahead for preload,
//  though anything between -0x40 and -0x100 seems to be "acceptable".
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldmdb   r1!,    COPY_REGISTERS
    subs    r2,     r2, #0x40
    stmdb   r0!,    COPY_REGISTERS
    pld     [r1, #-0x80]
    ldmdb   r1!,    COPY_REGISTERS
    pld     [r1, #-0x80]
    stmdb   r0!,    COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_descendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here.
    tst     r2,     #0x30
    beq     1f
0:  ldmdb   r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stmdb   r0!,   {r3,r4,r9,ip}
    tst     r2,     #0x30
    bne     0b
1:  tst     r2,     #0xf
    beq     2f
    lsls    ip,     r2, #29
    ldmdbcs r1!,   {r3,ip}
    stmdbcs r0!,   {r3,ip}
    ldrmi   r3,    [r1, #-4]!
    strmi   r3,    [r0, #-4]!
    lsls    ip,     r2, #31
    ldrhcs  r3,    [r1, #-2]!
    strhcs  r3,    [r0, #-2]!
    ldrbmi  ip,    [r1, #-1]
    strbmi  ip,    [r0, #-1]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  descending copy, source buffer is not word aligned                       *
 *****************************************************************************/

L_descendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2,     #4
    blo     L_descendingUnalignedByteCleanup
0:  tst     r0,     #0x7
    beq     L_descendingUnalignedVectorCopy
    ldrb    r3,    [r1, #-1]!
    subs    r2,     #1
    strb    r3,    [r0, #-1]!
    bhs     0b
L_descendingUnalignedByteCleanup:
    adds    r2,     #8
    beq     1f
0:  ldrb    r3,    [r1, #-1]!
    subs    r2,     #1
    strb    r3,    [r0, #-1]!
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_descendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
    subs    r2,     #0x18
    blo     L_descendingUnalignedVectorCleanup
0:  tst     r0,     #0x18
    beq     L_descendingUnalignedCachelineCopy
    sub     r1,     #8
    vld1.8  {d0},  [r1]
    sub     r0,     #8
    vst1.8  {d0},  [r0,:64]
    subs    r2,     #8
    bhs     0b
L_descendingUnalignedVectorCleanup:
    adds    r2,     #0x18
    blo     L_descendingUnalignedByteCleanup
0:  sub     r1,     #8
    vld1.8  {d0},  [r1]
    sub     r0,     #8
    vst1.8  {d0},  [r0,:64]
    subs    r2,     #8
    bhs     0b
    b       L_descendingUnalignedByteCleanup

L_descendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address.
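//
//  vld1/vst1 writeback with '!' always advances the pointer upward by the
//  transfer size, so this loop instead post-indexes by r4 = -32 to walk
//  downward; r4 was saved by ESTABLISH_FRAME, so it is free to clobber here.
//  The initial subtraction of 32 converts the one-past-the-end pointers into
//  the base of the highest remaining 32-byte block, and the adds afterward
//  undo it for the cleanup path.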
    sub     r1,     #32
    sub     r0,     #32
    mov     r4,     #-32
0:  vld1.8  {q0,q1},[r1], r4
    pld     [r1, #-0x60]
    vst1.8  {q0,q1},[r0,:256], r4
    subs    r2,     #0x20
    bhs     0b
    add     r1,     #32
    add     r0,     #32
    b       L_descendingUnalignedVectorCleanup

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD