+++ /dev/null
-/*
- * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-
-#if defined __thumb2__ && defined __ARM_NEON__
-
-// Use our tuned NEON implementation when it is available. Otherwise fall back
-// on more generic ARM code.
-
-#include "NEON/bcopy.s"
-
-#else // defined __thumb2__ && defined __ARM_NEON__
-
-/*****************************************************************************
- * ARMv5 and ARMv6 implementation *
- *****************************************************************************/
-
-#include <arm/arch.h>
-
-.text
-.align 2
-
- .globl _memcpy
- .globl _bcopy
- .globl _memmove
-
-_bcopy: /* void bcopy(const void *src, void *dest, size_t len); */
- mov r3, r0 /* swap r0 and r1 so that r0 = dest, r1 = src, as the shared code below expects */
- mov r0, r1
- mov r1, r3
-
-_memcpy: /* void *memcpy(void *dest, const void *src, size_t len); */
-_memmove: /* void *memmove(void *dest, const void *src, size_t len); */
- /* r0 = dest, r1 = src, r2 = len from here on. _memcpy and _memmove are
-  * the same address, so both get the overlap handling below.
-  * check for zero len or if the pointers are the same */
- cmp r2, #0
- cmpne r0, r1 /* only compared when len != 0; eq afterwards means "nothing to do" */
- bxeq lr /* early return, r0 (dest) is still the return value */
-
- /* save r0 (return value), r4 and r5 (scratch), plus r7 and lr */
- stmfd sp!, { r0, r4, r5, r7, lr }
- add r7, sp, #12 /* r7 = address of the saved r7 slot (saved lr sits right above it); presumably the frame pointer convention - confirm */
-
- /* check for overlap. r3 <- distance between src & dest */
- subhs r3, r0, r1 /* flags are still those of "cmpne r0, r1"; stmfd/add did not touch them */
- sublo r3, r1, r0
- cmp r3, r2 /* if distance(src, dest) < len, we have overlap */
- blo Loverlap
-
-Lnormalforwardcopy:
- /* are src and dest dissimilarly word aligned? (compares the low 2 bits of each address) */
- mov r12, r0, lsl #30
- cmp r12, r1, lsl #30
- bne Lnonwordaligned_forward
-
- /* if len < 64, do a quick forward copy */
- cmp r2, #64
- blt Lsmallforwardcopy
-
- /* check for 16 byte dest unalignment (only r0 is tested) */
- tst r0, #0xf
- bne Lsimilarlyunaligned
-
- /* check for 32 byte dest unalignment */
- tst r0, #(1<<4)
- bne Lunaligned_32
-
-Lmorethan64_aligned:
- /* save some more registers to use in the copy */
- stmfd sp!, { r6, r8, r10, r11 }
-
- /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
- sub r2, r2, #64
-
-L64loop:
- /* copy 64 bytes at a time, as two 32 byte (8 register) load/store bursts */
- ldmia r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
-#ifdef _ARM_ARCH_6
- pld [r1, #32]
-#endif
- stmia r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
- ldmia r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
- subs r2, r2, #64 /* sets the flags used by bge below; the pld/stmia in between do not change them */
-#ifdef _ARM_ARCH_6
- pld [r1, #32]
-#endif
- stmia r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
- bge L64loop /* loop while at least another 64 bytes remain */
-
- /* restore the scratch registers we just saved */
- ldmfd sp!, { r6, r8, r10, r11 }
-
- /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
- adds r2, r2, #64
- beq Lexit
-
-Llessthan64_aligned:
- /* copy 16 bytes at a time until we have < 16 bytes */
- cmp r2, #16
- ldmgeia r1!, { r3, r4, r5, r12 }
- stmgeia r0!, { r3, r4, r5, r12 }
- subges r2, r2, #16
- bgt Llessthan64_aligned /* taken only after a 16 byte copy that left bytes over */
- beq Lexit /* a 16 byte copy consumed exactly the rest; otherwise fall through with len < 16 */
-
-Llessthan16_aligned:
- /* move the low 4 bits of len into the condition flags so each size can be
-  * copied conditionally without branches: N = 8, Z = 4, C = 2, V = 1 */
- mov r2, r2, lsl #28
- msr cpsr_f, r2
-
- ldmmiia r1!, { r2, r3 } /* N: 8 bytes */
- ldreq r4, [r1], #4 /* Z: 4 bytes */
- ldrcsh r5, [r1], #2 /* C: 2 bytes */
- ldrvsb r12, [r1], #1 /* V: 1 byte */
-
- stmmiia r0!, { r2, r3 }
- streq r4, [r0], #4
- strcsh r5, [r0], #2
- strvsb r12, [r0], #1
- b Lexit
-
-Lsimilarlyunaligned:
- /* both src and dest are unaligned in similar ways (same low 2 address bits);
-  * copy 1-15 bytes so that dest reaches a 16 byte boundary.
-  * r12[31:28] = (0 - dest) & 15 = number of bytes needed, and that nibble
-  * is written to the flags: N = 8, Z = 4, C = 2, V = 1 */
- mov r12, r0, lsl #28
- rsb r12, r12, #0
- msr cpsr_f, r12
-
- ldrvsb r3, [r1], #1 /* smallest piece first, so each later access is aligned for its size */
- ldrcsh r4, [r1], #2
- ldreq r5, [r1], #4
-
- strvsb r3, [r0], #1
- strcsh r4, [r0], #2
- streq r5, [r0], #4
-
- ldmmiia r1!, { r3, r4 }
- stmmiia r0!, { r3, r4 }
-
- subs r2, r2, r12, lsr #28 /* len -= bytes just copied */
- beq Lexit
-
-Lunaligned_32:
- /* bring up to dest 32 byte alignment */
- tst r0, #(1 << 4)
- ldmneia r1!, { r3, r4, r5, r12 }
- stmneia r0!, { r3, r4, r5, r12 }
- subne r2, r2, #16
-
- /* we should now be aligned, see what copy method we should use */
- cmp r2, #64
- bge Lmorethan64_aligned
- b Llessthan64_aligned
-
-Lbytewise2:
- /* copy 2 bytes at a time; both loads are issued before either store.
-  * r2 (len) must be non-zero on entry. */
- subs r2, r2, #2 /* pl: at least 2 bytes were left; hi: more than 2 were left, so loop again */
-
- ldrb r3, [r1], #1
- ldrplb r4, [r1], #1 /* second byte only if it exists */
-
- strb r3, [r0], #1
- strplb r4, [r0], #1
-
- bhi Lbytewise2
- b Lexit
-
-Lbytewise:
- /* simple bytewise forward copy, each byte is stored before the next is loaded */
- ldrb r3, [r1], #1
- subs r2, r2, #1
- strb r3, [r0], #1
- bne Lbytewise
- b Lexit
-
-Lsmallforwardcopy:
- /* src and dest are word aligned similarly, less than 64 bytes to copy */
- cmp r2, #4
- blt Lbytewise2
-
- /* bytewise copy until word aligned (at most 3 bytes, and len >= 4 here) */
- tst r1, #3
-Lwordalignloop:
- ldrneb r3, [r1], #1
- strneb r3, [r0], #1
- subne r2, r2, #1
- tstne r1, #3 /* re-test src; once aligned the whole group is skipped and eq is left set */
- bne Lwordalignloop
-
- cmp r2, #16 /* pick the 16 byte loop or go straight to the < 16 byte tail */
- bge Llessthan64_aligned
- blt Llessthan16_aligned
-
-Loverlap:
- /* src and dest overlap in some way, len > 0; r3 = distance between them */
- cmp r0, r1 /* if dest > src */
- bhi Loverlap_srclower
-
-Loverlap_destlower:
- /* dest < src, see if we can still do a fast forward copy or fallback to slow forward copy */
- cmp r3, #64
- bge Lnormalforwardcopy /* distance is at least one 64 byte stride of the copy, use normal copy */
-
- cmp r3, #2 /* Lbytewise2 loads 2 bytes before storing, so it is only chosen when distance >= 2 */
- bge Lbytewise2
- b Lbytewise
-
- /* the following routines deal with having to copy in the reverse direction */
-Loverlap_srclower:
- /* src < dest, with overlap */
-
- /* src += len; dest += len; (both now point one past the last byte, the copy runs downwards) */
- add r0, r0, r2
- add r1, r1, r2
-
- /* we have to copy in reverse no matter what, test if we can use a large block reverse copy */
- cmp r2, #64 /* less than 64 bytes to copy? */
- cmpgt r3, #64 /* distance (r3) less than 64 bytes? only compared when len > 64 */
- blt Lbytewise_reverse /* NOTE(review): with len == 64 the cmpgt is skipped and the block path below is taken without testing r3 */
-
- /* test if src and dest are nonword aligned differently */
- mov r3, r0, lsl #30
- cmp r3, r1, lsl #30
- bne Lbytewise_reverse
-
- /* test if src and dest are non word aligned or dest is non 16 byte aligned */
- tst r0, #0xf
- bne Lunaligned_reverse_similarly
-
- /* test for dest 32 byte alignment */
- tst r0, #(1<<4)
- bne Lunaligned_32_reverse_similarly
-
- /* 64 byte reverse block copy, src and dest aligned */
-Lmorethan64_aligned_reverse:
- /* save some more registers to use in the copy */
- stmfd sp!, { r6, r8, r10, r11 }
-
- /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
- sub r2, r2, #64
-
-L64loop_reverse:
- /* copy 64 bytes at a time, as two 32 byte decrement-before bursts */
- ldmdb r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
-#ifdef _ARM_ARCH_6
- pld [r1, #-32]
-#endif
- stmdb r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
- ldmdb r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
- subs r2, r2, #64 /* sets the flags used by bge below; the pld/stmdb in between do not change them */
-#ifdef _ARM_ARCH_6
- pld [r1, #-32]
-#endif
- stmdb r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
- bge L64loop_reverse
-
- /* restore the scratch registers we just saved */
- ldmfd sp!, { r6, r8, r10, r11 }
-
- /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
- adds r2, r2, #64
- beq Lexit
-
-Lbytewise_reverse:
- ldrb r3, [r1, #-1]! /* pre-decrement: r0/r1 point one past the next byte to copy; r2 must be non-zero here */
- strb r3, [r0, #-1]!
- subs r2, r2, #1
- bne Lbytewise_reverse
- b Lexit
-
-Lunaligned_reverse_similarly:
- /* both src and dest are unaligned in similar ways (same low 2 address bits);
-  * copy 1-15 bytes downwards so that dest reaches a 16 byte boundary.
-  * r12[31:28] = dest & 15 = number of bytes needed, written to the flags:
-  * N = 8, Z = 4, C = 2, V = 1 */
- mov r12, r0, lsl #28
- msr cpsr_f, r12
-
- ldrvsb r3, [r1, #-1]! /* smallest piece first, so each later access is aligned for its size */
- ldrcsh r4, [r1, #-2]!
- ldreq r5, [r1, #-4]!
-
- strvsb r3, [r0, #-1]!
- strcsh r4, [r0, #-2]!
- streq r5, [r0, #-4]!
-
- ldmmidb r1!, { r3, r4 }
- stmmidb r0!, { r3, r4 }
-
- subs r2, r2, r12, lsr #28 /* len -= bytes just copied */
- beq Lexit
-
-Lunaligned_32_reverse_similarly:
- /* bring up to dest 32 byte alignment */
- tst r0, #(1 << 4)
- ldmnedb r1!, { r3, r4, r5, r12 }
- stmnedb r0!, { r3, r4, r5, r12 }
- subne r2, r2, #16
-
- /* we should now be aligned, see what copy method we should use */
- cmp r2, #64
- bge Lmorethan64_aligned_reverse
- b Lbytewise_reverse
-
- /* the following routines deal with non word aligned copies */
-Lnonwordaligned_forward:
- cmp r2, #8
- blt Lbytewise2 /* not worth the effort with less than 8 bytes total */
-
- /* bytewise copy until src word aligned (at most 3 bytes, so at least 5 remain) */
- tst r1, #3
-Lwordalignloop2:
- ldrneb r3, [r1], #1
- strneb r3, [r0], #1
- subne r2, r2, #1
- tstne r1, #3 /* re-test src; once aligned the whole group is skipped and eq is left set */
- bne Lwordalignloop2
-
- /* figure out how the src and dest are unaligned */
- and r3, r0, #3 /* src is word aligned now and the low bits differed on entry, so r3 is 1, 2 or 3 */
- cmp r3, #2
- blt Lalign1_forward
- beq Lalign2_forward
- bgt Lalign3_forward
-
-Lalign1_forward:
- /* the dest pointer is 1 byte off from src; copy with word loads/stores,
-  * carrying the leftover source byte in r4 between iterations.
-  * NOTE(review): the shift directions assume little-endian byte order - confirm */
- mov r12, r2, lsr #2 /* number of words we should copy */
- sub r0, r0, #1 /* back dest up to a word boundary */
-
- /* prime the copy */
- ldrb r4, [r0] /* load D[7:0], the byte just below dest; the first str writes it back */
-
-Lalign1_forward_loop:
- ldr r3, [r1], #4 /* load S */
- orr r4, r4, r3, lsl #8 /* D[31:8] = S[23:0] */
- str r4, [r0], #4 /* save D */
- mov r4, r3, lsr #24 /* D[7:0] = S[31:24] */
- subs r12, r12, #1
- bne Lalign1_forward_loop
-
- /* finish the copy off */
- strb r4, [r0], #1 /* save D[7:0] */
-
- ands r2, r2, #3 /* 0-3 bytes left over after the whole words */
- beq Lexit
- b Lbytewise2
-
-Lalign2_forward:
- /* the dest pointer is 2 bytes off from src; copy with word loads/stores,
-  * carrying the leftover source halfword in r4 between iterations.
-  * NOTE(review): the shift directions assume little-endian byte order - confirm */
- mov r12, r2, lsr #2 /* number of words we should copy */
- sub r0, r0, #2 /* back dest up to a word boundary */
-
- /* prime the copy */
- ldrh r4, [r0] /* load D[15:0], the 2 bytes just below dest; the first str writes them back */
-
-Lalign2_forward_loop:
- ldr r3, [r1], #4 /* load S */
- orr r4, r4, r3, lsl #16 /* D[31:16] = S[15:0] */
- str r4, [r0], #4 /* save D */
- mov r4, r3, lsr #16 /* D[15:0] = S[31:16] */
- subs r12, r12, #1
- bne Lalign2_forward_loop
-
- /* finish the copy off */
- strh r4, [r0], #2 /* save D[15:0] */
-
- ands r2, r2, #3 /* 0-3 bytes left over after the whole words */
- beq Lexit
- b Lbytewise2
-
-Lalign3_forward:
- /* the dest pointer is 3 bytes off from src; copy with word loads/stores,
-  * carrying the 3 leftover source bytes in r4 between iterations.
-  * NOTE(review): the shift directions assume little-endian byte order - confirm */
- mov r12, r2, lsr #2 /* number of words we should copy */
- sub r0, r0, #3 /* back dest up to a word boundary */
-
- /* prime the copy */
- ldr r4, [r0]
- and r4, r4, #0x00ffffff /* keep D[23:0], the 3 bytes just below dest; the first str writes them back */
-
-Lalign3_forward_loop:
- ldr r3, [r1], #4 /* load S */
- orr r4, r4, r3, lsl #24 /* D[31:24] = S[7:0] */
- str r4, [r0], #4 /* save D */
- mov r4, r3, lsr #8 /* D[23:0] = S[31:8] */
- subs r12, r12, #1
- bne Lalign3_forward_loop
-
- /* finish the copy off */
- strh r4, [r0], #2 /* save D[15:0] */
- mov r4, r4, lsr #16
- strb r4, [r0], #1 /* save D[23:16] */
-
- ands r2, r2, #3 /* 0-3 bytes left over after the whole words */
- beq Lexit
- b Lbytewise2
-
-Lexit:
- ldmfd sp!, {r0, r4, r5, r7, pc} /* common exit: restore the saved dest into r0 (return value), r4/r5/r7, and return by loading the saved lr into pc */
-
-#endif // defined __thumb2__ && defined __ARM_NEON__
-