/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#if defined __thumb2__ && defined __ARM_NEON__

// Use our tuned NEON implementation when it is available. Otherwise fall back
// on more generic ARM code.

#include "NEON/bcopy.s"

#else // defined __thumb2__ && defined __ARM_NEON__

/*****************************************************************************
 * ARMv5 and ARMv6 implementation                                            *
 *****************************************************************************/

#include <arm/arch.h>

	.text
	.align 2

	.globl _memcpy
	.globl _bcopy
	.globl _memmove

_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
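	/* bcopy takes (src, dest, len); swap the first two arguments so we can fall straight through into _memcpy/_memmove, which expect (dest, src, len) */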
	mov		r3, r0
	mov		r0, r1
	mov		r1, r3

_memcpy:	/* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:	/* void *memmove(void *dest, const void *src, size_t len); */
	/* check for zero len or if the pointers are the same */
	cmp		r2, #0
	cmpne	r0, r1
	bxeq	lr

	/* save r0 (return value), r4 (scratch), and r5 (scratch) */
	stmfd	sp!, { r0, r4, r5, r7, lr }
	add		r7, sp, #12
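	/* set up the frame pointer: r7 now points at the saved r7/lr pair */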

	/* check for overlap. r3 <- distance between src & dest */
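	/* the hs/lo conditions below still use the flags set by the cmpne r0, r1 above */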
	subhs	r3, r0, r1
	sublo	r3, r1, r0
	cmp		r3, r2			/* if distance(src, dest) < len, we have overlap */
	blo		Loverlap

Lnormalforwardcopy:
	/* are src and dest dissimilarly word aligned? */
	mov		r12, r0, lsl #30
	cmp		r12, r1, lsl #30
	bne		Lnonwordaligned_forward

	/* if len < 64, do a quick forward copy */
	cmp		r2, #64
	blt		Lsmallforwardcopy

	/* check for 16 byte src/dest unalignment */
	tst		r0, #0xf
	bne		Lsimilarlyunaligned

	/* check for 32 byte dest unalignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32

Lmorethan64_aligned:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop:
	/* copy 64 bytes at a time */
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Llessthan64_aligned:
	/* copy 16 bytes at a time until we have < 16 bytes */
	cmp		r2, #16
	ldmgeia	r1!, { r3, r4, r5, r12 }
	stmgeia	r0!, { r3, r4, r5, r12 }
	subges	r2, r2, #16
	bgt		Llessthan64_aligned
	beq		Lexit

Llessthan16_aligned:
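	/* put the remaining length (0-15) into the flags: bit 3 -> N, bit 2 -> Z, bit 1 -> C, bit 0 -> V, then use conditional loads/stores to copy the final 8/4/2/1 byte pieces */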
	mov		r2, r2, lsl #28
	msr		cpsr_f, r2

	ldmmiia	r1!, { r2, r3 }
	ldreq	r4, [r1], #4
	ldrcsh	r5, [r1], #2
	ldrvsb	r12, [r1], #1

	stmmiia	r0!, { r2, r3 }
	streq	r4, [r0], #4
	strcsh	r5, [r0], #2
	strvsb	r12, [r0], #1
	b		Lexit

Lsimilarlyunaligned:
	/* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
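	/* r12 ends up holding (16 - (dest & 0xf)) << 28, so the flags select the 1/2/4/8 byte copies needed to bring dest to 16 byte alignment */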
	mov		r12, r0, lsl #28
	rsb		r12, r12, #0
	msr		cpsr_f, r12

	ldrvsb	r3, [r1], #1
	ldrcsh	r4, [r1], #2
	ldreq	r5, [r1], #4

	strvsb	r3, [r0], #1
	strcsh	r4, [r0], #2
	streq	r5, [r0], #4

	ldmmiia	r1!, { r3, r4 }
	stmmiia	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmneia	r1!, { r3, r4, r5, r12 }
	stmneia	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned
	b		Llessthan64_aligned

Lbytewise2:
	/* copy 2 bytes at a time */
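	/* subtract 2 up front: the second byte is copied only while the count has not gone negative (pl), and the loop repeats while more than two bytes remained (hi) */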
	subs	r2, r2, #2

	ldrb	r3, [r1], #1
	ldrplb	r4, [r1], #1

	strb	r3, [r0], #1
	strplb	r4, [r0], #1

	bhi		Lbytewise2
	b		Lexit

Lbytewise:
	/* simple bytewise forward copy */
	ldrb	r3, [r1], #1
	subs	r2, r2, #1
	strb	r3, [r0], #1
	bne		Lbytewise
	b		Lexit

Lsmallforwardcopy:
	/* src and dest are word aligned similarly, less than 64 bytes to copy */
	cmp		r2, #4
	blt		Lbytewise2

	/* bytewise copy until word aligned */
	tst		r1, #3
Lwordalignloop:
	ldrneb	r3, [r1], #1
	strneb	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop

	cmp		r2, #16
	bge		Llessthan64_aligned
	blt		Llessthan16_aligned

Loverlap:
	/* src and dest overlap in some way, len > 0 */
	cmp		r0, r1			/* if dest > src */
	bhi		Loverlap_srclower

Loverlap_destlower:
	/* dest < src, see if we can still do a fast forward copy or fall back to the slow forward copy */
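	/* with dest below src, a forward copy is safe as long as each load/store stride is no larger than the src/dest distance in r3 */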
	cmp		r3, #64
	bge		Lnormalforwardcopy	/* overlap is greater than one stride of the copy, use normal copy */

	cmp		r3, #2
	bge		Lbytewise2
	b		Lbytewise

/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
	/* src < dest, with overlap */

	/* src += len; dest += len; */
	add		r0, r0, r2
	add		r1, r1, r2

	/* we have to copy in reverse no matter what, test if we can use a large block reverse copy */
	cmp		r2, #64			/* less than 64 bytes to copy? */
	cmpgt	r3, #64			/* less than 64 bytes of nonoverlap? */
	blt		Lbytewise_reverse

	/* test if src and dest are nonword aligned differently */
	mov		r3, r0, lsl #30
	cmp		r3, r1, lsl #30
	bne		Lbytewise_reverse

	/* test if src and dest are non word aligned or dest is non 16 byte aligned */
	tst		r0, #0xf
	bne		Lunaligned_reverse_similarly

	/* test for dest 32 byte alignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32_reverse_similarly

/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop_reverse:
	/* copy 64 bytes at a time */
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop_reverse

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Lbytewise_reverse:
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	subs	r2, r2, #1
	bne		Lbytewise_reverse
	b		Lexit

Lunaligned_reverse_similarly:
	/* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
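	/* copying backwards, the low four bits of dest are exactly the number of bytes needed to reach 16 byte alignment, so they go straight into the flags (no negation this time) */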
	mov		r12, r0, lsl #28
	msr		cpsr_f, r12

	ldrvsb	r3, [r1, #-1]!
	ldrcsh	r4, [r1, #-2]!
	ldreq	r5, [r1, #-4]!

	strvsb	r3, [r0, #-1]!
	strcsh	r4, [r0, #-2]!
	streq	r5, [r0, #-4]!

	ldmmidb	r1!, { r3, r4 }
	stmmidb	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32_reverse_similarly:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmnedb	r1!, { r3, r4, r5, r12 }
	stmnedb	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned_reverse
	b		Lbytewise_reverse

/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
	cmp		r2, #8
	blt		Lbytewise2		/* not worth the word-merging effort for very short copies */

	/* bytewise copy until src word aligned */
	tst		r1, #3
Lwordalignloop2:
	ldrneb	r3, [r1], #1
	strneb	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop2

	/* figure out how the src and dest are unaligned */
	and		r3, r0, #3
	cmp		r3, #2
	blt		Lalign1_forward
	beq		Lalign2_forward
	bgt		Lalign3_forward

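/* in each of the three cases below src is already word aligned; dest is backed up to a word boundary and r4 is primed with the byte(s) already in place, so every load and store in the loop stays word aligned while the data is shifted into position */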
Lalign1_forward:
	/* the dest pointer is 1 byte off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #1

	/* prime the copy */
	ldrb	r4, [r0]				/* load D[7:0] */

Lalign1_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #8		/* D[31:8] = S[23:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #24			/* D[7:0] = S[31:24] */
	subs	r12, r12, #1
	bne		Lalign1_forward_loop

	/* finish the copy off */
	strb	r4, [r0], #1			/* save D[7:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign2_forward:
	/* the dest pointer is 2 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #2

	/* prime the copy */
	ldrh	r4, [r0]				/* load D[15:0] */

Lalign2_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #16		/* D[31:16] = S[15:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #16			/* D[15:0] = S[31:16] */
	subs	r12, r12, #1
	bne		Lalign2_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2			/* save D[15:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign3_forward:
	/* the dest pointer is 3 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #3

	/* prime the copy */
	ldr		r4, [r0]
	and		r4, r4, #0x00ffffff		/* load D[23:0] */

Lalign3_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #24		/* D[31:24] = S[7:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #8			/* D[23:0] = S[31:8] */
	subs	r12, r12, #1
	bne		Lalign3_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2			/* save D[15:0] */
	mov		r4, r4, lsr #16
	strb	r4, [r0], #1			/* save D[23:16] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lexit:
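	/* reload the original dest pointer into r0 (the memcpy/memmove return value), restore the saved registers, and return */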
	ldmfd	sp!, {r0, r4, r5, r7, pc}

#endif // defined __thumb2__ && defined __ARM_NEON__