+/*
+ * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*****************************************************************************
+ * ARMv5 and ARMv6 implementation, also used in dyld on later archs *
+ *****************************************************************************/
+
+#include <arm/arch.h>
+#if !defined _ARM_ARCH_7 || defined VARIANT_DYLD
+
+.text
+.align 2
+
+ .globl _memcpy
+ .globl _bcopy
+ .globl _memmove
+
+_bcopy: /* void bcopy(const void *src, void *dest, size_t len); */
+ mov r3, r0
+ mov r0, r1
+ mov r1, r3
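+ /* arguments are now ordered (dest, src, len); fall through into memmove */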
+
+_memcpy: /* void *memcpy(void *dest, const void *src, size_t len); */
+_memmove: /* void *memmove(void *dest, const void *src, size_t len); */
+ /* check for zero len or if the pointers are the same */
+ cmp r2, #0
+ cmpne r0, r1
+ bxeq lr
+
+ /* save r0 (return value), r4 (scratch), and r5 (scratch) */
+ stmfd sp!, { r0, r4, r5, r7, lr }
+ add r7, sp, #12
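+ /* r0 is saved so Lexit can return the original dest pointer */
+ /* r7 points at the saved r7/lr pair, keeping the frame pointer chain intact */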
+
+ /* check for overlap. r3 <- distance between src & dest */
+ subhs r3, r0, r1
+ sublo r3, r1, r0
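+ /* r3 now holds |dest - src|; the hs/lo conditions reuse the flags set by the cmpne at entry */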
+ cmp r3, r2 /* if distance(src, dest) < len, we have overlap */
+ blo Loverlap
+
+Lnormalforwardcopy:
+ /* are src and dest dissimilarly word aligned? */
+ mov r12, r0, lsl #30
+ cmp r12, r1, lsl #30
+ bne Lnonwordaligned_forward
+
+ /* if len < 64, do a quick forward copy */
+ cmp r2, #64
+ blt Lsmallforwardcopy
+
+ /* check for 16 byte src/dest unalignment */
+ tst r0, #0xf
+ bne Lsimilarlyunaligned
+
+ /* check for 32 byte dest unalignment */
+ tst r0, #(1<<4)
+ bne Lunaligned_32
+
+Lmorethan64_aligned:
+ /* save some more registers to use in the copy */
+ stmfd sp!, { r6, r8, r10, r11 }
+
+ /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
+ sub r2, r2, #64
+
+L64loop:
+ /* copy 64 bytes at a time */
+ ldmia r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+#ifdef _ARM_ARCH_6
+ pld [r1, #32]
+#endif
+ stmia r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+ ldmia r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+ subs r2, r2, #64
+#ifdef _ARM_ARCH_6
+ pld [r1, #32]
+#endif
+ stmia r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+ bge L64loop
+
+ /* restore the scratch registers we just saved */
+ ldmfd sp!, { r6, r8, r10, r11 }
+
+ /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
+ adds r2, r2, #64
+ beq Lexit
+
+Llessthan64_aligned:
+ /* copy 16 bytes at a time until we have < 16 bytes */
+ cmp r2, #16
+ ldmgeia r1!, { r3, r4, r5, r12 }
+ stmgeia r0!, { r3, r4, r5, r12 }
+ subges r2, r2, #16
+ bgt Llessthan64_aligned
+ beq Lexit
+
+Llessthan16_aligned:
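+ /* fewer than 16 bytes remain: shift the low 4 bits of len into the NZCV flags */
+ /* (N = 8 byte chunk, Z = 4, C = 2, V = 1) so the conditional loads/stores */
+ /* below copy exactly the remaining bytes without any branches */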
+ mov r2, r2, lsl #28
+ msr cpsr_f, r2
+
+ ldmmiia r1!, { r2, r3 }
+ ldreq r4, [r1], #4
+ ldrcsh r5, [r1], #2
+ ldrvsb r12, [r1], #1
+
+ stmmiia r0!, { r2, r3 }
+ streq r4, [r0], #4
+ strcsh r5, [r0], #2
+ strvsb r12, [r0], #1
+ b Lexit
+
+Lsimilarlyunaligned:
+ /* both src and dest are unaligned in similar ways; align dest to a 16 byte boundary (the fall-through into Lunaligned_32 finishes 32 byte alignment) */
+ mov r12, r0, lsl #28
+ rsb r12, r12, #0
+ msr cpsr_f, r12
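+ /* the top nibble of r12 is 16 - (dest & 15), the byte count needed to reach */
+ /* the next 16 byte boundary; it now sits in NZCV (V = 1, C = 2, Z = 4, N = 8), */
+ /* so the conditional copies below advance src and dest by exactly that amount */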
+
+ ldrvsb r3, [r1], #1
+ ldrcsh r4, [r1], #2
+ ldreq r5, [r1], #4
+
+ strvsb r3, [r0], #1
+ strcsh r4, [r0], #2
+ streq r5, [r0], #4
+
+ ldmmiia r1!, { r3, r4 }
+ stmmiia r0!, { r3, r4 }
+
+ subs r2, r2, r12, lsr #28
+ beq Lexit
+
+Lunaligned_32:
+ /* bring dest up to 32 byte alignment */
+ tst r0, #(1 << 4)
+ ldmneia r1!, { r3, r4, r5, r12 }
+ stmneia r0!, { r3, r4, r5, r12 }
+ subne r2, r2, #16
+
+ /* we should now be aligned, see what copy method we should use */
+ cmp r2, #64
+ bge Lmorethan64_aligned
+ b Llessthan64_aligned
+
+Lbytewise2:
+ /* copy 2 bytes at a time */
+ subs r2, r2, #2
+
+ ldrb r3, [r1], #1
+ ldrplb r4, [r1], #1
+
+ strb r3, [r0], #1
+ strplb r4, [r0], #1
+
+ bhi Lbytewise2
+ b Lexit
+
+Lbytewise:
+ /* simple bytewise forward copy */
+ ldrb r3, [r1], #1
+ subs r2, r2, #1
+ strb r3, [r0], #1
+ bne Lbytewise
+ b Lexit
+
+Lsmallforwardcopy:
+ /* src and dest are word aligned similarly, less than 64 bytes to copy */
+ cmp r2, #4
+ blt Lbytewise2
+
+ /* bytewise copy until word aligned */
+ tst r1, #3
+Lwordalignloop:
+ ldrneb r3, [r1], #1
+ strneb r3, [r0], #1
+ subne r2, r2, #1
+ tstne r1, #3
+ bne Lwordalignloop
+
+ cmp r2, #16
+ bge Llessthan64_aligned
+ blt Llessthan16_aligned
+
+Loverlap:
+ /* src and dest overlap in some way, len > 0 */
+ cmp r0, r1 /* if dest > src */
+ bhi Loverlap_srclower
+
+Loverlap_destlower:
+ /* dest < src, see if we can still do a fast forward copy or fall back to a slow forward copy */
+ cmp r3, #64
+ bge Lnormalforwardcopy /* overlap is greater than one stride of the copy, use normal copy */
+
+ cmp r3, #2
+ bge Lbytewise2
+ b Lbytewise
+
+ /* the following routines deal with having to copy in the reverse direction */
+Loverlap_srclower:
+ /* src < dest, with overlap */
+
+ /* src += len; dest += len; */
+ add r0, r0, r2
+ add r1, r1, r2
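+ /* r0 and r1 now point one past the end of dest and src; the reverse copy loops pre-decrement before each access */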
+
+ /* we have to copy in reverse no matter what, test if we can use a large block reverse copy */
+ cmp r2, #64 /* less than 64 bytes to copy? */
+ cmpgt r3, #64 /* less than 64 bytes of nonoverlap? */
+ blt Lbytewise_reverse
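+ /* the cmpgt only runs when len > 64, so the blt above takes the bytewise path */
+ /* when len < 64, or when len > 64 but the src/dest distance in r3 is under 64 */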
+
+ /* test if src and dest are dissimilarly word aligned */
+ mov r3, r0, lsl #30
+ cmp r3, r1, lsl #30
+ bne Lbytewise_reverse
+
+ /* test for 16 byte dest unalignment (src shares dest's word alignment) */
+ tst r0, #0xf
+ bne Lunaligned_reverse_similarly
+
+ /* test for dest 32 byte alignment */
+ tst r0, #(1<<4)
+ bne Lunaligned_32_reverse_similarly
+
+ /* 64 byte reverse block copy, src and dest aligned */
+Lmorethan64_aligned_reverse:
+ /* save some more registers to use in the copy */
+ stmfd sp!, { r6, r8, r10, r11 }
+
+ /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
+ sub r2, r2, #64
+
+L64loop_reverse:
+ /* copy 64 bytes at a time */
+ ldmdb r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+#ifdef _ARM_ARCH_6
+ pld [r1, #-32]
+#endif
+ stmdb r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+ ldmdb r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+ subs r2, r2, #64
+#ifdef _ARM_ARCH_6
+ pld [r1, #-32]
+#endif
+ stmdb r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+ bge L64loop_reverse
+
+ /* restore the scratch registers we just saved */
+ ldmfd sp!, { r6, r8, r10, r11 }
+
+ /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
+ adds r2, r2, #64
+ beq Lexit
+
+Lbytewise_reverse:
+ ldrb r3, [r1, #-1]!
+ strb r3, [r0, #-1]!
+ subs r2, r2, #1
+ bne Lbytewise_reverse
+ b Lexit
+
+Lunaligned_reverse_similarly:
+ /* both src and dest are unaligned in similar ways; copy down so dest reaches a 16 byte boundary (the fall-through into Lunaligned_32_reverse_similarly finishes 32 byte alignment) */
+ mov r12, r0, lsl #28
+ msr cpsr_f, r12
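+ /* unlike the forward case the nibble is not negated: copying downward, dest & 15 */
+ /* is itself the byte count down to the previous 16 byte boundary */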
+
+ ldrvsb r3, [r1, #-1]!
+ ldrcsh r4, [r1, #-2]!
+ ldreq r5, [r1, #-4]!
+
+ strvsb r3, [r0, #-1]!
+ strcsh r4, [r0, #-2]!
+ streq r5, [r0, #-4]!
+
+ ldmmidb r1!, { r3, r4 }
+ stmmidb r0!, { r3, r4 }
+
+ subs r2, r2, r12, lsr #28
+ beq Lexit
+
+Lunaligned_32_reverse_similarly:
+ /* bring dest down to 32 byte alignment */
+ tst r0, #(1 << 4)
+ ldmnedb r1!, { r3, r4, r5, r12 }
+ stmnedb r0!, { r3, r4, r5, r12 }
+ subne r2, r2, #16
+
+ /* we should now be aligned, see what copy method we should use */
+ cmp r2, #64
+ bge Lmorethan64_aligned_reverse
+ b Lbytewise_reverse
+
+ /* the following routines deal with non word aligned copies */
+Lnonwordaligned_forward:
+ cmp r2, #8
+ blt Lbytewise2 /* not worth the effort with less than 8 bytes total */
+
+ /* bytewise copy until src word aligned */
+ tst r1, #3
+Lwordalignloop2:
+ ldrneb r3, [r1], #1
+ strneb r3, [r0], #1
+ subne r2, r2, #1
+ tstne r1, #3
+ bne Lwordalignloop2
+
+ /* figure out how dest is misaligned relative to the now word aligned src */
+ and r3, r0, #3
+ cmp r3, #2
+ blt Lalign1_forward
+ beq Lalign2_forward
+ bgt Lalign3_forward
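+ /* each loop below loads whole words from the word aligned src and merges them */
+ /* with shifts into whole word stores to dest, substituting for unaligned accesses */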
+
+Lalign1_forward:
+ /* the dest pointer is 1 byte off from src */
+ mov r12, r2, lsr #2 /* number of words we should copy */
+ sub r0, r0, #1
+
+ /* prime the copy */
+ ldrb r4, [r0] /* load D[7:0] */
+
+Lalign1_forward_loop:
+ ldr r3, [r1], #4 /* load S */
+ orr r4, r4, r3, lsl #8 /* D[31:8] = S[23:0] */
+ str r4, [r0], #4 /* save D */
+ mov r4, r3, lsr #24 /* D[7:0] = S[31:24] */
+ subs r12, r12, #1
+ bne Lalign1_forward_loop
+
+ /* finish the copy off */
+ strb r4, [r0], #1 /* save D[7:0] */
+
+ ands r2, r2, #3
+ beq Lexit
+ b Lbytewise2
+
+Lalign2_forward:
+ /* the dest pointer is 2 bytes off from src */
+ mov r12, r2, lsr #2 /* number of words we should copy */
+ sub r0, r0, #2
+
+ /* prime the copy */
+ ldrh r4, [r0] /* load D[15:0] */
+
+Lalign2_forward_loop:
+ ldr r3, [r1], #4 /* load S */
+ orr r4, r4, r3, lsl #16 /* D[31:16] = S[15:0] */
+ str r4, [r0], #4 /* save D */
+ mov r4, r3, lsr #16 /* D[15:0] = S[31:16] */
+ subs r12, r12, #1
+ bne Lalign2_forward_loop
+
+ /* finish the copy off */
+ strh r4, [r0], #2 /* save D[15:0] */
+
+ ands r2, r2, #3
+ beq Lexit
+ b Lbytewise2
+
+Lalign3_forward:
+ /* the dest pointer is 3 bytes off from src */
+ mov r12, r2, lsr #2 /* number of words we should copy */
+ sub r0, r0, #3
+
+ /* prime the copy */
+ ldr r4, [r0]
+ and r4, r4, #0x00ffffff /* load D[23:0] */
+
+Lalign3_forward_loop:
+ ldr r3, [r1], #4 /* load S */
+ orr r4, r4, r3, lsl #24 /* D[31:24] = S[7:0] */
+ str r4, [r0], #4 /* save D */
+ mov r4, r3, lsr #8 /* D[23:0] = S[31:8] */
+ subs r12, r12, #1
+ bne Lalign3_forward_loop
+
+ /* finish the copy off */
+ strh r4, [r0], #2 /* save D[15:0] */
+ mov r4, r4, lsr #16
+ strb r4, [r0], #1 /* save D[23:16] */
+
+ ands r2, r2, #3
+ beq Lexit
+ b Lbytewise2
+
+Lexit:
+ ldmfd sp!, {r0, r4, r5, r7, pc}
+
+#endif // !defined _ARM_ARCH_7 || defined VARIANT_DYLD
+