2 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
24 #if defined __thumb2__ && defined __ARM_NEON__
26 // Use our tuned NEON implementation when it is available. Otherwise fall back
27 // on more generic ARM code.
29 #include "NEON/bcopy.s"
31 #else // defined __thumb2__ && defined __ARM_NEON__
33 /*****************************************************************************
34 * ARMv5 and ARMv6 implementation *
35 *****************************************************************************/
/*
 * Shared body for bcopy/memcpy/memmove (ARMv5/ARMv6, ARM mode, pre-UAL
 * mnemonics such as ldmgeia/stmmidb = conditionally-executed ldm/stm).
 *
 * NOTE(review): this chunk has lines elided (the embedded original line
 * numbers are discontinuous), so instructions between the visible ones
 * are missing from this view. Comments below describe only what the
 * visible code establishes; anything depending on elided code is hedged.
 *
 * Register roles in the common body (as visible here):
 *   r0  = dest pointer; the entry value is pushed at L"59" and popped
 *         back into r0 at exit, providing the memcpy/memmove return value
 *   r1  = src pointer
 *   r2  = len / bytes remaining
 *   r3-r5, r12 (plus r6, r8, r10, r11 inside the 64-byte loops) = data
 *
 * NOTE(review): bcopy's C signature is (src, dest, len) while
 * memcpy/memmove take (dest, src, len); presumably an r0/r1 swap follows
 * the _bcopy label in the elided lines — confirm against the full source.
 */
46 _bcopy: /* void bcopy(const void *src, void *dest, size_t len); */
51 _memcpy: /* void *memcpy(void *dest, const void *src, size_t len); */
52 _memmove: /* void *memmove(void *dest, const void *src, size_t len); */
53 /* check for zero len or if the pointers are the same */
58 /* save r0 (return value), r4 (scratch), and r5 (scratch) */
59 stmfd sp!, { r0, r4, r5, r7, lr }
62 /* check for overlap. r3 <- distance between src & dest */
65 cmp r3, r2 /* if distance(src, dest) < len, we have overlap */
69 /* are src and dest dissimilarly word aligned? */
72 bne Lnonwordaligned_forward
74 /* if len < 64, do a quick forward copy */
78 /* check for 16 byte src/dest unalignment */
80 bne Lsimilarlyunaligned
82 /* check for 32 byte dest unalignment */
87 /* save some more registers to use in the copy */
88 stmfd sp!, { r6, r8, r10, r11 }
90 /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
94 /* copy 64 bytes at a time */
/* two 8-register ldm/stm pairs move 32 bytes each = 64 bytes per iteration */
95 ldmia r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
99 stmia r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
100 ldmia r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
105 stmia r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
108 /* restore the scratch registers we just saved */
109 ldmfd sp!, { r6, r8, r10, r11 }
111 /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
116 /* copy 16 bytes at a time until we have < 16 bytes */
118 ldmgeia r1!, { r3, r4, r5, r12 }
119 stmgeia r0!, { r3, r4, r5, r12 }
121 bgt Llessthan64_aligned
/* MI condition: N flag set by an elided flag-setting shift of the
 * remaining len — NOTE(review): confirm against the full source */
128 ldmmiia r1!, { r2, r3 }
133 stmmiia r0!, { r2, r3 }
140 /* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
153 ldmmiia r1!, { r3, r4 }
154 stmmiia r0!, { r3, r4 }
/* r12's top nibble holds the byte count consumed by the alignment copy
 * above (set in elided lines) — NOTE(review): confirm */
156 subs r2, r2, r12, lsr #28
160 /* bring up to dest 32 byte alignment */
162 ldmneia r1!, { r3, r4, r5, r12 }
163 stmneia r0!, { r3, r4, r5, r12 }
166 /* we should now be aligned, see what copy method we should use */
168 bge Lmorethan64_aligned
169 b Llessthan64_aligned
172 /* copy 2 bytes at a time */
185 /* simple bytewise forward copy */
193 /* src and dest are word aligned similarly, less than 64 bytes to copy */
197 /* bytewise copy until word aligned */
207 bge Llessthan64_aligned
208 blt Llessthan16_aligned
211 /* src and dest overlap in some way, len > 0 */
212 cmp r0, r1 /* if dest > src */
213 bhi Loverlap_srclower
216 /* dest < src, see if we can still do a fast forward copy or fallback to slow forward copy */
218 bge Lnormalforwardcopy /* overlap is greater than one stride of the copy, use normal copy */
224 /* the following routines deal with having to copy in the reverse direction */
226 /* src < dest, with overlap */
228 /* src += len; dest += len; */
232 /* we have to copy in reverse no matter what, test if we can use a large block reverse copy */
233 cmp r2, #64 /* less than 64 bytes to copy? */
234 cmpgt r3, #64 /* less than 64 bytes of nonoverlap? */
235 blt Lbytewise_reverse
237 /* test if src and dest are nonword aligned differently */
240 bne Lbytewise_reverse
242 /* test if src and dest are non word aligned or dest is non 16 byte aligned */
244 bne Lunaligned_reverse_similarly
246 /* test for dest 32 byte alignment */
248 bne Lunaligned_32_reverse_similarly
250 /* 64 byte reverse block copy, src and dest aligned */
251 Lmorethan64_aligned_reverse:
252 /* save some more registers to use in the copy */
253 stmfd sp!, { r6, r8, r10, r11 }
255 /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
259 /* copy 64 bytes at a time */
/* mirror of the forward 64-byte loop: ldmdb/stmdb walk the pointers
 * downward so overlapping src < dest regions copy safely */
260 ldmdb r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
264 stmdb r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
265 ldmdb r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
270 stmdb r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
273 /* restore the scratch registers we just saved */
274 ldmfd sp!, { r6, r8, r10, r11 }
276 /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
284 bne Lbytewise_reverse
287 Lunaligned_reverse_similarly:
288 /* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
/* VS/CS conditions come from an elided flag-setting shift of the
 * alignment amount: copy 1 trailing byte and/or 1 halfword as needed —
 * NOTE(review): confirm against the full source */
292 ldrvsb r3, [r1, #-1]!
293 ldrcsh r4, [r1, #-2]!
296 strvsb r3, [r0, #-1]!
297 strcsh r4, [r0, #-2]!
300 ldmmidb r1!, { r3, r4 }
301 stmmidb r0!, { r3, r4 }
/* as in the forward path, r12's top nibble = bytes consumed aligning */
303 subs r2, r2, r12, lsr #28
306 Lunaligned_32_reverse_similarly:
307 /* bring up to dest 32 byte alignment */
309 ldmnedb r1!, { r3, r4, r5, r12 }
310 stmnedb r0!, { r3, r4, r5, r12 }
313 /* we should now be aligned, see what copy method we should use */
315 bge Lmorethan64_aligned_reverse
318 /* the following routines deal with non word aligned copies */
319 Lnonwordaligned_forward:
321 blt Lbytewise2 /* not worth the effort with less than 24 bytes total */
323 /* bytewise copy until src word aligned */
332 /* figure out how the src and dest are unaligned */
340 /* the dest pointer is 1 byte off from src */
/* src is now word aligned; merge each loaded word with the byte(s) held
 * over in r4 to produce word-sized stores at the misaligned dest */
341 mov r12, r2, lsr #2 /* number of words we should copy */
345 ldrb r4, [r0] /* load D[7:0] */
347 Lalign1_forward_loop:
348 ldr r3, [r1], #4 /* load S */
349 orr r4, r4, r3, lsl #8 /* D[31:8] = S[23:0] */
350 str r4, [r0], #4 /* save D */
351 mov r4, r3, lsr #24 /* D[7:0] = S[31:24] */
353 bne Lalign1_forward_loop
355 /* finish the copy off */
356 strb r4, [r0], #1 /* save D[7:0] */
363 /* the dest pointer is 2 bytes off from src */
364 mov r12, r2, lsr #2 /* number of words we should copy */
368 ldrh r4, [r0] /* load D[15:0] */
370 Lalign2_forward_loop:
371 ldr r3, [r1], #4 /* load S */
372 orr r4, r4, r3, lsl #16 /* D[31:16] = S[15:0] */
373 str r4, [r0], #4 /* save D */
374 mov r4, r3, lsr #16 /* D[15:0] = S[31:16] */
376 bne Lalign2_forward_loop
378 /* finish the copy off */
379 strh r4, [r0], #2 /* save D[15:0] */
386 /* the dest pointer is 3 bytes off from src */
387 mov r12, r2, lsr #2 /* number of words we should copy */
392 and r4, r4, #0x00ffffff /* load D[23:0] */
394 Lalign3_forward_loop:
395 ldr r3, [r1], #4 /* load S */
396 orr r4, r4, r3, lsl #24 /* D[31:24] = S[7:0] */
397 str r4, [r0], #4 /* save D */
398 mov r4, r3, lsr #8 /* D[23:0] = S[31:8] */
400 bne Lalign3_forward_loop
402 /* finish the copy off */
403 strh r4, [r0], #2 /* save D[15:0] */
/* r4 is presumably shifted right 16 in an elided line so this stores the
 * third leftover byte — NOTE(review): confirm against the full source */
405 strb r4, [r0], #1 /* save D[23:16] */
/* common exit: restore scratch regs, reload original dest into r0 as the
 * return value, and return via pc */
412 ldmfd sp!, {r0, r4, r5, r7, pc}
414 #endif // defined __thumb2__ && defined __ARM_NEON__