/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#if defined __thumb2__ && defined __ARM_NEON__

// Use our tuned NEON implementation when it is available. Otherwise fall back
// on more generic ARM code.

#include "NEON/bcopy.s"

#else // defined __thumb2__ && defined __ARM_NEON__

/*****************************************************************************
 * ARMv5 and ARMv6 implementation                                            *
 *****************************************************************************/

#include <arm/arch.h>

        .text
        .align 2

        .globl _memcpy
        .globl _bcopy
        .globl _memmove

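/*
 * bcopy's argument order is (src, dest, len); the entry point below swaps
 * the first two arguments and falls through into _memcpy/_memmove, which
 * take (dest, src, len).
 */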
_bcopy:         /* void bcopy(const void *src, void *dest, size_t len); */
        mov     r3, r0
        mov     r0, r1
        mov     r1, r3

_memcpy:        /* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:       /* void *memmove(void *dest, const void *src, size_t len); */
        /* check for zero len or if the pointers are the same */
        cmp     r2, #0
        cmpne   r0, r1
        bxeq    lr

        /* save r0 (return value), r4 (scratch), and r5 (scratch) */
        stmfd   sp!, { r0, r4, r5, r7, lr }
        add     r7, sp, #12
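        /* r7 now points at the saved r7/lr pair, keeping the frame-pointer
         * chain intact (the saved r0/r4/r5 sit below it on the stack) */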

        /* check for overlap. r3 <- distance between src & dest */
        subhs   r3, r0, r1
        sublo   r3, r1, r0
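        /* hs/lo reuse the flags set by the cmpne r0, r1 above, so r3 gets
         * |dest - src| without a second compare */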
        cmp     r3, r2          /* if distance(src, dest) < len, we have overlap */
        blo     Loverlap

Lnormalforwardcopy:
        /* are src and dest dissimilarly word aligned? */
        mov     r12, r0, lsl #30
        cmp     r12, r1, lsl #30
        bne     Lnonwordaligned_forward

        /* if len < 64, do a quick forward copy */
        cmp     r2, #64
        blt     Lsmallforwardcopy

        /* check whether dest is 16 byte aligned (src shares its word alignment) */
        tst     r0, #0xf
        bne     Lsimilarlyunaligned

        /* check for 32 byte dest unalignment */
        tst     r0, #(1<<4)
        bne     Lunaligned_32

Lmorethan64_aligned:
        /* save some more registers to use in the copy */
        stmfd   sp!, { r6, r8, r10, r11 }

        /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
        sub     r2, r2, #64

L64loop:
        /* copy 64 bytes at a time */
        ldmia   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
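        /* on ARMv6, prefetch ahead of the source pointer to hide memory
         * latency in the copy loop */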
#ifdef _ARM_ARCH_6
        pld     [r1, #32]
#endif
        stmia   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        ldmia   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        subs    r2, r2, #64
#ifdef _ARM_ARCH_6
        pld     [r1, #32]
#endif
        stmia   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        bge     L64loop

        /* restore the scratch registers we just saved */
        ldmfd   sp!, { r6, r8, r10, r11 }

        /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
        adds    r2, r2, #64
        beq     Lexit

Llessthan64_aligned:
        /* copy 16 bytes at a time until we have < 16 bytes */
        cmp     r2, #16
        ldmgeia r1!, { r3, r4, r5, r12 }
        stmgeia r0!, { r3, r4, r5, r12 }
        subges  r2, r2, #16
        bgt     Llessthan64_aligned
        beq     Lexit

Llessthan16_aligned:
        mov     r2, r2, lsl #28
        msr     cpsr_f, r2
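        /* the low four bits of the remaining count are now in N/Z/C/V:
         * N = bit 3 (8 bytes), Z = bit 2 (4 bytes), C = bit 1 (2 bytes),
         * V = bit 0 (1 byte), so the conditional transfers below move
         * exactly the 0-15 bytes that remain */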

        ldmmiia r1!, { r2, r3 }
        ldreq   r4, [r1], #4
        ldrcsh  r5, [r1], #2
        ldrvsb  r12, [r1], #1

        stmmiia r0!, { r2, r3 }
        streq   r4, [r0], #4
        strcsh  r5, [r0], #2
        strvsb  r12, [r0], #1
        b       Lexit

Lsimilarlyunaligned:
        /* src and dest are unaligned in the same way; copy the 1-15 bytes
         * needed to bring dest up to 16 byte alignment, then fall through
         * to finish 32 byte alignment */
        mov     r12, r0, lsl #28
        rsb     r12, r12, #0
        msr     cpsr_f, r12
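        /* r12 now holds (16 - (dest & 0xf)) & 0xf in its top nibble; the
         * same N/Z/C/V encoding as above selects the 1/2/4/8 byte moves
         * that advance dest to the next 16 byte boundary */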

        ldrvsb  r3, [r1], #1
        ldrcsh  r4, [r1], #2
        ldreq   r5, [r1], #4

        strvsb  r3, [r0], #1
        strcsh  r4, [r0], #2
        streq   r5, [r0], #4

        ldmmiia r1!, { r3, r4 }
        stmmiia r0!, { r3, r4 }

        subs    r2, r2, r12, lsr #28
        beq     Lexit

Lunaligned_32:
        /* bring dest up to 32 byte alignment */
        tst     r0, #(1 << 4)
        ldmneia r1!, { r3, r4, r5, r12 }
        stmneia r0!, { r3, r4, r5, r12 }
        subne   r2, r2, #16

        /* we should now be aligned, see what copy method we should use */
        cmp     r2, #64
        bge     Lmorethan64_aligned
        b       Llessthan64_aligned

Lbytewise2:
        /* copy 2 bytes at a time */
        subs    r2, r2, #2
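        /* pl: a second byte exists only while len - 2 >= 0, so an odd
         * tail copies a single byte and the closing bhi falls through */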

        ldrb    r3, [r1], #1
        ldrplb  r4, [r1], #1

        strb    r3, [r0], #1
        strplb  r4, [r0], #1

        bhi     Lbytewise2
        b       Lexit

Lbytewise:
        /* simple bytewise forward copy */
        ldrb    r3, [r1], #1
        subs    r2, r2, #1
        strb    r3, [r0], #1
        bne     Lbytewise
        b       Lexit

Lsmallforwardcopy:
        /* src and dest are word aligned similarly, less than 64 bytes to copy */
        cmp     r2, #4
        blt     Lbytewise2

        /* bytewise copy until word aligned */
        tst     r1, #3
Lwordalignloop:
        ldrneb  r3, [r1], #1
        strneb  r3, [r0], #1
        subne   r2, r2, #1
        tstne   r1, #3
        bne     Lwordalignloop

        cmp     r2, #16
        bge     Llessthan64_aligned
        blt     Llessthan16_aligned

Loverlap:
        /* src and dest overlap in some way, len > 0 */
        cmp     r0, r1          /* if dest > src */
        bhi     Loverlap_srclower

Loverlap_destlower:
        /* dest < src; see if we can still do a fast forward copy or must
         * fall back to a slow forward copy */
        cmp     r3, #64
        bge     Lnormalforwardcopy      /* overlap is greater than one stride of the copy, use normal copy */
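        /* a forward copy is safe here because each 64 byte block is read
         * before any of it can be overwritten when the gap between the
         * buffers is at least one full stride */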

        cmp     r3, #2
        bge     Lbytewise2
        b       Lbytewise

/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
        /* src < dest, with overlap */

        /* src += len; dest += len; */
        add     r0, r0, r2
        add     r1, r1, r2

        /* we have to copy in reverse no matter what; test whether we can
         * use a large block reverse copy */
        cmp     r2, #64         /* less than 64 bytes to copy? */
        cmpgt   r3, #64         /* less than 64 bytes of nonoverlap? */
        blt     Lbytewise_reverse

        /* test whether src and dest are word aligned differently */
        mov     r3, r0, lsl #30
        cmp     r3, r1, lsl #30
        bne     Lbytewise_reverse

        /* test whether dest is 16 byte aligned */
        tst     r0, #0xf
        bne     Lunaligned_reverse_similarly

        /* test for dest 32 byte alignment */
        tst     r0, #(1<<4)
        bne     Lunaligned_32_reverse_similarly

/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
        /* save some more registers to use in the copy */
        stmfd   sp!, { r6, r8, r10, r11 }

        /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
        sub     r2, r2, #64

L64loop_reverse:
        /* copy 64 bytes at a time */
        ldmdb   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
        pld     [r1, #-32]
#endif
        stmdb   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        ldmdb   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        subs    r2, r2, #64
#ifdef _ARM_ARCH_6
        pld     [r1, #-32]
#endif
        stmdb   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        bge     L64loop_reverse

        /* restore the scratch registers we just saved */
        ldmfd   sp!, { r6, r8, r10, r11 }

        /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
        adds    r2, r2, #64
        beq     Lexit

Lbytewise_reverse:
        ldrb    r3, [r1, #-1]!
        strb    r3, [r0, #-1]!
        subs    r2, r2, #1
        bne     Lbytewise_reverse
        b       Lexit

Lunaligned_reverse_similarly:
        /* src and dest are unaligned in the same way; copy the 1-15 bytes
         * needed to bring dest down to 16 byte alignment */
        mov     r12, r0, lsl #28
        msr     cpsr_f, r12
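        /* copying downwards, so dest's own low nibble (now in the top bits
         * of r12) is the byte count to the 16 byte boundary below; the
         * N/Z/C/V trick again selects the matching 1/2/4/8 byte moves */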

        ldrvsb  r3, [r1, #-1]!
        ldrcsh  r4, [r1, #-2]!
        ldreq   r5, [r1, #-4]!

        strvsb  r3, [r0, #-1]!
        strcsh  r4, [r0, #-2]!
        streq   r5, [r0, #-4]!

        ldmmidb r1!, { r3, r4 }
        stmmidb r0!, { r3, r4 }

        subs    r2, r2, r12, lsr #28
        beq     Lexit

Lunaligned_32_reverse_similarly:
        /* bring dest down to 32 byte alignment */
        tst     r0, #(1 << 4)
        ldmnedb r1!, { r3, r4, r5, r12 }
        stmnedb r0!, { r3, r4, r5, r12 }
        subne   r2, r2, #16

        /* we should now be aligned, see what copy method we should use */
        cmp     r2, #64
        bge     Lmorethan64_aligned_reverse
        b       Lbytewise_reverse

/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
        cmp     r2, #8
        blt     Lbytewise2      /* the shifting setup isn't worth it for very short copies */

        /* bytewise copy until src word aligned */
        tst     r1, #3
Lwordalignloop2:
        ldrneb  r3, [r1], #1
        strneb  r3, [r0], #1
        subne   r2, r2, #1
        tstne   r1, #3
        bne     Lwordalignloop2

        /* figure out how the src and dest are unaligned */
        and     r3, r0, #3
        cmp     r3, #2
        blt     Lalign1_forward
        beq     Lalign2_forward
        bgt     Lalign3_forward

Lalign1_forward:
        /* the dest pointer is 1 byte off from src */
        mov     r12, r2, lsr #2         /* number of words we should copy */
        sub     r0, r0, #1

        /* prime the copy */
        ldrb    r4, [r0]                /* load D[7:0] */
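        /* r0 was rewound to a word boundary, so the byte already there is
         * reloaded and re-stored unchanged with each full word; this keeps
         * every store in the loop word aligned */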

Lalign1_forward_loop:
        ldr     r3, [r1], #4            /* load S */
        orr     r4, r4, r3, lsl #8      /* D[31:8] = S[23:0] */
        str     r4, [r0], #4            /* save D */
        mov     r4, r3, lsr #24         /* D[7:0] = S[31:24] */
        subs    r12, r12, #1
        bne     Lalign1_forward_loop

        /* finish the copy off */
        strb    r4, [r0], #1            /* save D[7:0] */

        ands    r2, r2, #3
        beq     Lexit
        b       Lbytewise2

Lalign2_forward:
        /* the dest pointer is 2 bytes off from src */
        mov     r12, r2, lsr #2         /* number of words we should copy */
        sub     r0, r0, #2

        /* prime the copy */
        ldrh    r4, [r0]                /* load D[15:0] */
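        /* same seam trick as Lalign1_forward, but the carried piece is a
         * halfword */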

Lalign2_forward_loop:
        ldr     r3, [r1], #4            /* load S */
        orr     r4, r4, r3, lsl #16     /* D[31:16] = S[15:0] */
        str     r4, [r0], #4            /* save D */
        mov     r4, r3, lsr #16         /* D[15:0] = S[31:16] */
        subs    r12, r12, #1
        bne     Lalign2_forward_loop

        /* finish the copy off */
        strh    r4, [r0], #2            /* save D[15:0] */

        ands    r2, r2, #3
        beq     Lexit
        b       Lbytewise2

Lalign3_forward:
        /* the dest pointer is 3 bytes off from src */
        mov     r12, r2, lsr #2         /* number of words we should copy */
        sub     r0, r0, #3

        /* prime the copy */
        ldr     r4, [r0]
        and     r4, r4, #0x00ffffff     /* load D[23:0] */
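        /* same trick with a three byte seam: the three bytes below dest
         * are kept in D[23:0] and re-stored unchanged with each word */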

Lalign3_forward_loop:
        ldr     r3, [r1], #4            /* load S */
        orr     r4, r4, r3, lsl #24     /* D[31:24] = S[7:0] */
        str     r4, [r0], #4            /* save D */
        mov     r4, r3, lsr #8          /* D[23:0] = S[31:8] */
        subs    r12, r12, #1
        bne     Lalign3_forward_loop

        /* finish the copy off */
        strh    r4, [r0], #2            /* save D[15:0] */
        mov     r4, r4, lsr #16
        strb    r4, [r0], #1            /* save D[23:16] */

        ands    r2, r2, #3
        beq     Lexit
        b       Lbytewise2

Lexit:
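        /* restore the saved dest into r0 (the memcpy/memmove return value)
         * and return by popping the saved lr straight into pc */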
        ldmfd   sp!, {r0, r4, r5, r7, pc}

#endif // defined __thumb2__ && defined __ARM_NEON__