/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#if defined __thumb2__ && defined __ARM_NEON__

// Use our tuned NEON implementation when it is available. Otherwise fall back
// on more generic ARM code.
#include "NEON/bcopy.s"

#else // defined __thumb2__ && defined __ARM_NEON__

/*****************************************************************************
 * ARMv5 and ARMv6 implementation                                           *
 *****************************************************************************/

#include <arm/arch.h>

.text
.align 2

	.globl _memcpy
	.globl _bcopy
	.globl _memmove

_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
	/* swap src and dest so bcopy can share the memcpy/memmove body */
	mov		r3, r0
	mov		r0, r1
	mov		r1, r3

_memcpy:	/* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:	/* void *memmove(void *dest, const void *src, size_t len); */
	/* check for zero len or if the pointers are the same */
	cmp		r2, #0
	cmpne	r0, r1
	bxeq	lr

	/* save r0 (return value), r4 (scratch), and r5 (scratch) */
	stmfd	sp!, { r0, r4, r5, r7, lr }
	add		r7, sp, #12		/* frame pointer chain: r7 points at the saved r7 */

	/* check for overlap. r3 <- distance between src & dest */
	subhs	r3, r0, r1
	sublo	r3, r1, r0
	cmp		r3, r2			/* if distance(src, dest) < len, we have overlap */
	blo		Loverlap
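/*
 * A minimal C sketch of the dispatch above, for exposition only (not
 * assembled into this file; the function name is hypothetical). The
 * distance test decides whether a plain forward copy is safe:
 *
 *   #include <stddef.h>
 *   static int may_overlap(const char *dest, const char *src, size_t len) {
 *       if (len == 0 || dest == src)
 *           return 0;                         // bxeq lr: nothing to do
 *       size_t dist = dest > src ? (size_t)(dest - src)
 *                                : (size_t)(src - dest);
 *       return dist < len;                    // blo Loverlap
 *   }
 */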
Lnormalforwardcopy:
	/* are src and dest dissimilarly word aligned? */
	mov		r12, r0, lsl #30
	cmp		r12, r1, lsl #30
	bne		Lnonwordaligned_forward

	/* if len < 64, do a quick forward copy */
	cmp		r2, #64
	blt		Lsmallforwardcopy

	/* check for 16 byte src/dest unalignment */
	tst		r0, #0xf
	bne		Lsimilarlyunaligned

	/* check for 32 byte dest unalignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32

Lmorethan64_aligned:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop:
	/* copy 64 bytes at a time */
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Llessthan64_aligned:
	/* copy 16 bytes at a time until we have < 16 bytes */
	cmp		r2, #16
	ldmgeia	r1!, { r3, r4, r5, r12 }
	stmgeia	r0!, { r3, r4, r5, r12 }
	subges	r2, r2, #16
	bgt		Llessthan64_aligned
	beq		Lexit

Llessthan16_aligned:
	/* move the low 4 bits of len into the NZCV flags: N = bit 3 (copy 8
	   bytes), Z = bit 2 (copy 4), C = bit 1 (copy 2), V = bit 0 (copy 1);
	   each conditional transfer below then fires for exactly the bytes
	   that remain */
	mov		r2, r2, lsl #28
	msr		cpsr_f, r2

	ldmmiia	r1!, { r2, r3 }
	ldreq	r4, [r1], #4
	ldrcsh	r5, [r1], #2
	ldrvsb	r12, [r1], #1

	stmmiia	r0!, { r2, r3 }
	streq	r4, [r0], #4
	strcsh	r5, [r0], #2
	strvsb	r12, [r0], #1
	b		Lexit

Lsimilarlyunaligned:
	/* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
	/* rsb turns the low nibble of dest into (16 - (dest & 0xf)), so the
	   flags select the 1/2/4/8 byte moves needed to reach 16 byte alignment */
	mov		r12, r0, lsl #28
	rsb		r12, r12, #0
	msr		cpsr_f, r12

	ldrvsb	r3, [r1], #1
	ldrcsh	r4, [r1], #2
	ldreq	r5, [r1], #4

	strvsb	r3, [r0], #1
	strcsh	r4, [r0], #2
	streq	r5, [r0], #4

	ldmmiia	r1!, { r3, r4 }
	stmmiia	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32:
	/* bring dest up to 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmneia	r1!, { r3, r4, r5, r12 }
	stmneia	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned
	b		Llessthan64_aligned

Lbytewise2:
	/* copy 2 bytes at a time */
	subs	r2, r2, #2

	ldrb	r3, [r1], #1
	ldrplb	r4, [r1], #1

	strb	r3, [r0], #1
	strplb	r4, [r0], #1

	bhi		Lbytewise2
	b		Lexit

Lbytewise:
	/* simple bytewise forward copy */
	ldrb	r3, [r1], #1
	subs	r2, r2, #1
	strb	r3, [r0], #1
	bne		Lbytewise
	b		Lexit

Lsmallforwardcopy:
	/* src and dest are word aligned similarly, less than 64 bytes to copy */
	cmp		r2, #4
	blt		Lbytewise2

	/* bytewise copy until word aligned */
	tst		r1, #3
Lwordalignloop:
	ldrneb	r3, [r1], #1
	strneb	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop

	cmp		r2, #16
	bge		Llessthan64_aligned
	blt		Llessthan16_aligned

Loverlap:
	/* src and dest overlap in some way, len > 0 */
	cmp		r0, r1				/* if dest > src */
	bhi		Loverlap_srclower

Loverlap_destlower:
	/* dest < src, see if we can still do a fast forward copy, or fall back to a slow forward copy */
	cmp		r3, #64
	bge		Lnormalforwardcopy	/* overlap is greater than one stride of the copy, use normal copy */

	cmp		r3, #2
	bge		Lbytewise2
	b		Lbytewise

/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
	/* src < dest, with overlap */
	/* src += len; dest += len; */
	add		r0, r0, r2
	add		r1, r1, r2

	/* we have to copy in reverse no matter what, test whether we can use a large block reverse copy */
	cmp		r2, #64				/* less than 64 bytes to copy? */
	cmpgt	r3, #64				/* less than 64 bytes of nonoverlap? */
	blt		Lbytewise_reverse
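/*
 * Why the copy runs backward here: with src < dest and the ranges
 * overlapping, a forward copy would overwrite source bytes before they are
 * read. A minimal C sketch of the bytewise fallback (exposition only; the
 * function name is hypothetical):
 *
 *   #include <stddef.h>
 *   static void copy_reverse(char *dest, const char *src, size_t len) {
 *       dest += len;                          // add r0, r0, r2
 *       src  += len;                          // add r1, r1, r2
 *       while (len--)
 *           *--dest = *--src;                 // Lbytewise_reverse
 *   }
 */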
	/* test if src and dest are word aligned differently */
	mov		r3, r0, lsl #30
	cmp		r3, r1, lsl #30
	bne		Lbytewise_reverse

	/* test if src and dest are non word aligned or dest is non 16 byte aligned */
	tst		r0, #0xf
	bne		Lunaligned_reverse_similarly

	/* test for dest 32 byte alignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32_reverse_similarly

	/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop_reverse:
	/* copy 64 bytes at a time */
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop_reverse

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Lbytewise_reverse:
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	subs	r2, r2, #1
	bne		Lbytewise_reverse
	b		Lexit

Lunaligned_reverse_similarly:
	/* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
	/* the low 4 bits of dest, moved into the NZCV flags, select how many
	   trailing bytes to peel off: N = 8, Z = 4, C = 2, V = 1 */
	mov		r12, r0, lsl #28
	msr		cpsr_f, r12

	ldrvsb	r3, [r1, #-1]!
	ldrcsh	r4, [r1, #-2]!
	ldreq	r5, [r1, #-4]!

	strvsb	r3, [r0, #-1]!
	strcsh	r4, [r0, #-2]!
	streq	r5, [r0, #-4]!

	ldmmidb	r1!, { r3, r4 }
	stmmidb	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32_reverse_similarly:
	/* bring dest up to 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmnedb	r1!, { r3, r4, r5, r12 }
	stmnedb	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned_reverse
	b		Lbytewise_reverse

/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
	cmp		r2, #8
	blt		Lbytewise2		/* not worth the effort below 8 bytes total */

	/* bytewise copy until src word aligned */
	tst		r1, #3
Lwordalignloop2:
	ldrneb	r3, [r1], #1
	strneb	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop2

	/* figure out how the src and dest are unaligned */
	and		r3, r0, #3
	cmp		r3, #2
	blt		Lalign1_forward
	beq		Lalign2_forward
	bgt		Lalign3_forward

Lalign1_forward:
	/* the dest pointer is 1 byte off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #1

	/* prime the copy */
	ldrb	r4, [r0]				/* load D[7:0] */

Lalign1_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #8		/* D[31:8] = S[23:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #24			/* D[7:0] = S[31:24] */
	subs	r12, r12, #1
	bne		Lalign1_forward_loop

	/* finish the copy off */
	strb	r4, [r0], #1			/* save D[7:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign2_forward:
	/* the dest pointer is 2 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #2

	/* prime the copy */
	ldrh	r4, [r0]				/* load D[15:0] */

Lalign2_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #16		/* D[31:16] = S[15:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #16			/* D[15:0] = S[31:16] */
	subs	r12, r12, #1
	bne		Lalign2_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2			/* save D[15:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2
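/*
 * The LalignN_forward loops above and below read whole words from the
 * word-aligned source and shift-merge them for a destination that is N
 * bytes off, carrying the leftover bytes in a register across iterations.
 * A C sketch of the 1-byte-offset case, assuming little-endian byte order
 * and that dest - 1 is word aligned (exposition only; the names are
 * hypothetical):
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *   static void copy_off1(uint8_t *dest, const uint32_t *src, size_t words) {
 *       uint32_t *d = (uint32_t *)(dest - 1); // sub r0, r0, #1
 *       uint32_t carry = *(const uint8_t *)d; // prime with the byte at D[7:0]
 *       while (words--) {
 *           uint32_t s = *src++;              // load S
 *           *d++ = carry | (s << 8);          // D[31:8] = S[23:0]
 *           carry = s >> 24;                  // D[7:0] = S[31:24]
 *       }
 *       *(uint8_t *)d = (uint8_t)carry;       // flush the carried byte
 *   }
 */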
Lalign3_forward:
	/* the dest pointer is 3 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #3

	/* prime the copy */
	ldr		r4, [r0]
	and		r4, r4, #0x00ffffff		/* load D[23:0] */

Lalign3_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #24		/* D[31:24] = S[7:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #8			/* D[23:0] = S[31:8] */
	subs	r12, r12, #1
	bne		Lalign3_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2			/* save D[15:0] */
	mov		r4, r4, lsr #16
	strb	r4, [r0], #1			/* save D[23:16] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lexit:
	/* restore the scratch registers; popping the saved r0 returns the
	   original dest pointer, and popping the saved lr into pc returns */
	ldmfd	sp!, {r0, r4, r5, r7, pc}

#endif // defined __thumb2__ && defined __ARM_NEON__
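/*
 * Caller-visible semantics these entry points implement (reference sketch,
 * not part of this file's build):
 *
 *   #include <string.h>
 *   #include <strings.h>
 *   // memmove returns dest and tolerates overlap:
 *   //   char buf[8] = "abcdefg";
 *   //   memmove(buf + 1, buf, 6);   // buf is now "aabcdef"
 *   // bcopy takes (src, dest, len), hence the register swap at _bcopy:
 *   //   bcopy(buf, buf + 1, 6);     // same effect
 */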