/*
 * Copyright (c) 2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Cortex-A9 processor:
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All copy n successive bytes from source to destination. Memmove and memcpy
 * return destination, whereas bcopy has no return value. Copying takes place
 * as if it were through a temporary buffer -- after return, destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on OS X and iOS).
 */
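
/*
 * For reference, a hedged C sketch of the "as if through a temporary buffer"
 * semantics described above (purely illustrative; not part of this file, and
 * the helper name "copy_via_temp" is ours, not Apple's):
 *
 *  #include <stddef.h>
 *  #include <stdlib.h>
 *
 *  void *copy_via_temp(void *destination, const void *source, size_t n) {
 *      unsigned char *tmp = malloc(n);                // error handling omitted
 *      const unsigned char *s = source;
 *      unsigned char *d = destination;
 *      for (size_t i = 0; i < n; ++i) tmp[i] = s[i];  // read all of source...
 *      for (size_t i = 0; i < n; ++i) d[i] = tmp[i];  // ...then write destination
 *      free(tmp);
 *      return destination;
 *  }
 */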

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

//  A9_ENTRY(name) defines the global symbol _name$VARIANT$CortexA9; for
//  example, A9_ENTRY(memcpy) defines _memcpy$VARIANT$CortexA9.
#define A9_ENTRY(name) \
    .align 2;\
    .globl _ ## name ## $VARIANT$CortexA9;\
    _ ## name ## $VARIANT$CortexA9:

//  ESTABLISH_FRAME saves r0 (the destination pointer) along with the usual
//  frame registers, so that CLEAR_FRAME_AND_RETURN pops the original
//  destination back into r0 as the return value of memcpy and memmove.
#define ESTABLISH_FRAME \
    push    {r0,r4,r7,lr};\
    add     r7, sp, #8

#define CLEAR_FRAME_AND_RETURN \
    pop     {r0,r4,r7,pc}

#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.text
.syntax unified
.code 32

A9_ENTRY(bcopy)
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments.
    mov     r3, r0
    mov     r0, r1
    mov     r1, r3

A9_ENTRY(memcpy)
A9_ENTRY(memmove)
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics. We detect this case with the
//  test:
//
//      destination - source < length (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
    subs    r3, r0, r1          // r3 = destination - source
    bxeq    lr                  // destination == source; nothing to copy
    ESTABLISH_FRAME
    cmp     r3, r2              // if destination - source < length (unsigned),
    blo     L_descendingCopy    // the buffers overlap; copy backwards

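// A hedged C sketch of the overlap test above (illustrative only; the helper
// name "overlap_requires_descending" is ours, not part of this file):
//
//  #include <stddef.h>
//  #include <stdint.h>
//
//  static int overlap_requires_descending(void *dst, const void *src, size_t n) {
//      // Unsigned wraparound is well defined here: when src > dst the
//      // difference wraps to a huge value, which is only below n in cases
//      // where copying in either direction is correct anyway.
//      return (uintptr_t)dst - (uintptr_t)src < n;
//  }
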
/*****************************************************************************
 *  ascending copy                                                           *
 *****************************************************************************/

//  The layout of the two buffers is such that we can use our preferred
//  (ascending address order) copy implementation. Throughout this copy,
//  registers are used as follows:
//
//      r0      lowest unwritten address in the destination buffer.
//      r1      lowest unread address in the source buffer.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_ascendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment. ip holds the destination's misalignment (1, 2, or 3), so
//  4 - ip bytes are copied: the first ldrb/strb pair runs unconditionally,
//  the ls-conditioned pair runs when ip <= 2, and the lo-conditioned pair
//  runs only when ip == 1.
    subs    r2, #4
    blo     L_ascendingLengthLessThanFour
    ands    ip, r0, #0x3
    beq     L_ascendingDestinationWordAligned
    ldrb    r3, [r1],#1
    cmp     ip, #2
    ldrbls  r4, [r1],#1
    strb    r3, [r0],#1
    ldrblo  r3, [r1],#1
    add     r2, ip              // together with the subs below, r2 -= 4 - ip
    strbls  r4, [r0],#1
    strblo  r3, [r0],#1
    subs    r2, #4
    bhs     L_ascendingDestinationWordAligned

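// A hedged C sketch of the word-alignment prologue above (illustrative only):
//
//  size_t head = (uintptr_t)dst & 3;       // "ip" in the code above
//  if (head != 0) {                        // copy 4 - head bytes
//      for (size_t i = head; i < 4; ++i) {
//          *dst++ = *src++;
//          --length;
//      }
//  }
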
L_ascendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This is
//  only used if the original length of the buffer is smaller than four.
//  Subtracting four left the low two bits of r2 unchanged, so the lsls moves
//  bit 1 of the length into the carry flag and bit 0 into the sign flag; the
//  cs-conditioned pairs copy two bytes and the mi-conditioned pair copies one
//  more (e.g. length == 3 copies 2 + 1 bytes).
    lsls    ip, r2, #31
    ldrbcs  r3, [r1],#1
    ldrbcs  ip, [r1],#1
    ldrbmi  r4, [r1]
    strbcs  r3, [r0],#1
    strbcs  ip, [r0],#1
    strbmi  r4, [r0]
    CLEAR_FRAME_AND_RETURN

L_ascendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1, #0x3
    bne     L_ascendingUnalignedCopy

/*****************************************************************************
 *  ascending copy, both buffers have word alignment                         *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
    subs    r2, r2, #0x3c
    blo     L_ascendingLengthLessThanSixtyFour
0:  tst     r0, #0x1c
    beq     L_ascendingDestinationCachelineAligned
    ldr     r3, [r1],#4
    subs    r2, #4
    str     r3, [r0],#4
    bhs     0b
    b       L_ascendingLengthLessThanSixtyFour

L_ascendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that 0x60 is the optimal lookahead for preload,
//  though anything between 0x40 and 0x100 seems to be "acceptable".
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldm     r1!, COPY_REGISTERS
    subs    r2, r2, #0x40
    stm     r0!, COPY_REGISTERS
    pld     [r1, #0x60]
    ldm     r1!, COPY_REGISTERS
    pld     [r1, #0x60]
    stm     r0!, COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

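// A hedged C analogue of the unrolled loop above (illustrative only;
// __builtin_prefetch is the GCC/Clang counterpart of pld, and the helper
// name "copy_cachelines" is ours):
//
//  #include <stddef.h>
//  #include <stdint.h>
//
//  static void copy_cachelines(uint32_t *dst, const uint32_t *src, size_t words) {
//      // words is a multiple of 16; both pointers are cacheline aligned.
//      for (size_t i = 0; i < words; i += 16) {
//          __builtin_prefetch(&src[i + 24]);   // roughly 0x60 bytes ahead
//          for (size_t j = 0; j < 16; ++j)
//              dst[i + j] = src[i + j];
//      }
//  }
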
L_ascendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here. The lsls instructions
//  move bits of the remaining length into the flags: the first shift puts
//  bit 3 into C and bit 2 into N (selecting eight- and four-byte copies),
//  and the second puts bit 1 into C and bit 0 into N (two- and one-byte
//  copies).
    tst     r2, #0x30
    beq     1f
0:  ldm     r1!, {r3,r4,r9,ip}
    sub     r2, r2, #0x10
    stm     r0!, {r3,r4,r9,ip}
    tst     r2, #0x30
    bne     0b
1:  tst     r2, #0xf
    beq     2f
    lsls    ip, r2, #29
    ldmcs   r1!, {r3,ip}
    stmcs   r0!, {r3,ip}
    ldrmi   r3, [r1],#4
    strmi   r3, [r0],#4
    lsls    ip, r2, #31
    ldrhcs  r3, [r1],#2
    strhcs  r3, [r0],#2
    ldrbmi  ip, [r1]
    strbmi  ip, [r0]
2:  CLEAR_FRAME_AND_RETURN

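// A hedged C sketch of the bit-testing cleanup above (illustrative only;
// the helper name "cleanup_copy" is ours):
//
//  #include <string.h>
//
//  static void cleanup_copy(unsigned char *dst, const unsigned char *src, size_t n) {
//      // n < 64; copy 16-byte blocks, then 8/4/2/1 as the length bits dictate.
//      while (n & 0x30) { memcpy(dst, src, 16); dst += 16; src += 16; n -= 16; }
//      if (n & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
//      if (n & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
//      if (n & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
//      if (n & 1) { *dst = *src; }
//  }
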
/*****************************************************************************
 *  ascending copy, source buffer is not word aligned                        *
 *****************************************************************************/

L_ascendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2, #4
    blo     L_ascendingUnalignedByteCleanup
0:  tst     r0, #0x7
    beq     L_ascendingUnalignedVectorCopy
    ldrb    r3, [r1],#1
    subs    r2, #1
    strb    r3, [r0],#1
    bhs     0b
L_ascendingUnalignedByteCleanup:
    adds    r2, #8
    beq     1f
0:  ldrb    r3, [r1],#1
    subs    r2, #1
    strb    r3, [r0],#1
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_ascendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
    subs    r2, #0x18
    blo     L_ascendingUnalignedVectorCleanup
0:  tst     r0, #0x18
    beq     L_ascendingUnalignedCachelineCopy
    vld1.8  {d0}, [r1]!
    subs    r2, #8
    vst1.8  {d0}, [r0,:64]!
    bhs     0b
L_ascendingUnalignedVectorCleanup:
    adds    r2, #0x18
    blo     L_ascendingUnalignedByteCleanup
0:  vld1.8  {d0}, [r1]!
    subs    r2, #8
    vst1.8  {d0}, [r0,:64]!
    bhs     0b
    b       L_ascendingUnalignedByteCleanup

L_ascendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address.
    vld1.8  {q0,q1}, [r1]!
    pld     [r1, #0x60]
    vst1.8  {q0,q1}, [r0,:256]!
    subs    r2, #0x20
    bhs     L_ascendingUnalignedCachelineCopy
    b       L_ascendingUnalignedVectorCleanup

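// A hedged C sketch of the NEON copy above, using arm_neon.h intrinsics
// (illustrative only; the helper name "neon_copy" is ours -- it assumes dst
// is 32-byte aligned and n is a multiple of 32, while vld1q_u8 places no
// alignment requirement on src):
//
//  #include <stddef.h>
//  #include <arm_neon.h>
//
//  static void neon_copy(uint8_t *dst, const uint8_t *src, size_t n) {
//      for (size_t i = 0; i < n; i += 32) {
//          uint8x16_t lo = vld1q_u8(src + i);
//          uint8x16_t hi = vld1q_u8(src + i + 16);
//          vst1q_u8(dst + i, lo);
//          vst1q_u8(dst + i + 16, hi);
//      }
//  }
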
/*****************************************************************************
 *  descending copy                                                          *
 *****************************************************************************/

//  The layout of the two buffers is such that we must copy in descending-
//  address order. Throughout this copy, registers are used as follows:
//
//      r0      lowest address in the destination buffer that has been written to.
//      r1      lowest address in the source buffer that has been read from.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_descendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment. First, both pointers are advanced to one past the end of
//  their buffers; ip then holds the number of bytes (1, 2, or 3) needed to
//  bring the destination down to word alignment: the first ldrb/strb pair
//  runs unconditionally, the hs-conditioned pair runs when ip >= 2, and the
//  hi-conditioned pair runs only when ip == 3.
    add     r1, r2              // point one past the end of the source
    add     r0, r2              // point one past the end of the destination
    subs    r2, #4
    blo     L_descendingLengthLessThanFour
    ands    ip, r0, #0x3
    beq     L_descendingDestinationWordAligned
    ldrb    r3, [r1, #-1]!
    cmp     ip, #2
    ldrbhs  r4, [r1, #-1]!
    strb    r3, [r0, #-1]!
    ldrbhi  r3, [r1, #-1]!
    strbhs  r4, [r0, #-1]!
    strbhi  r3, [r0, #-1]!
    subs    r2, ip
    bhs     L_descendingDestinationWordAligned

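// A hedged C sketch of the descending setup and head copy above
// (illustrative only):
//
//  dst += length;  src += length;          // one past the end of each buffer
//  size_t head = (uintptr_t)dst & 3;       // "ip" in the code above
//  while (head--) {                        // copy down to word alignment
//      *--dst = *--src;
//      --length;
//  }
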
L_descendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This is
//  only used if the original length of the buffer is smaller than four. As
//  in the ascending case, the lsls moves bit 1 of the length into the carry
//  flag and bit 0 into the sign flag to select the conditional copies.
    lsls    ip, r2, #31
    ldrbcs  r3, [r1, #-1]!
    ldrbcs  ip, [r1, #-1]!
    ldrbmi  r4, [r1, #-1]
    strbcs  r3, [r0, #-1]!
    strbcs  ip, [r0, #-1]!
    strbmi  r4, [r0, #-1]
    CLEAR_FRAME_AND_RETURN

L_descendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1, #0x3
    bne     L_descendingUnalignedCopy

/*****************************************************************************
 *  descending copy, both buffers have word alignment                        *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
    subs    r2, r2, #0x3c
    blo     L_descendingLengthLessThanSixtyFour
0:  tst     r0, #0x1c
    beq     L_descendingDestinationCachelineAligned
    ldr     r3, [r1, #-4]!
    subs    r2, #4
    str     r3, [r0, #-4]!
    bhs     0b
    b       L_descendingLengthLessThanSixtyFour

L_descendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that -0x80 is the optimal lookahead for preload,
//  though anything between -0x40 and -0x100 seems to be "acceptable".
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldmdb   r1!, COPY_REGISTERS
    subs    r2, r2, #0x40
    stmdb   r0!, COPY_REGISTERS
    pld     [r1, #-0x80]
    ldmdb   r1!, COPY_REGISTERS
    pld     [r1, #-0x80]
    stmdb   r0!, COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_descendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here. The lsls flag tricks
//  mirror those in the ascending cleanup path.
    tst     r2, #0x30
    beq     1f
0:  ldmdb   r1!, {r3,r4,r9,ip}
    sub     r2, r2, #0x10
    stmdb   r0!, {r3,r4,r9,ip}
    tst     r2, #0x30
    bne     0b
1:  tst     r2, #0xf
    beq     2f
    lsls    ip, r2, #29
    ldmdbcs r1!, {r3,ip}
    stmdbcs r0!, {r3,ip}
    ldrmi   r3, [r1, #-4]!
    strmi   r3, [r0, #-4]!
    lsls    ip, r2, #31
    ldrhcs  r3, [r1, #-2]!
    strhcs  r3, [r0, #-2]!
    ldrbmi  ip, [r1, #-1]
    strbmi  ip, [r0, #-1]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  descending copy, source buffer is not word aligned                       *
 *****************************************************************************/

L_descendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2, #4
    blo     L_descendingUnalignedByteCleanup
0:  tst     r0, #0x7
    beq     L_descendingUnalignedVectorCopy
    ldrb    r3, [r1, #-1]!
    subs    r2, #1
    strb    r3, [r0, #-1]!
    bhs     0b
L_descendingUnalignedByteCleanup:
    adds    r2, #8
    beq     1f
0:  ldrb    r3, [r1, #-1]!
    subs    r2, #1
    strb    r3, [r0, #-1]!
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_descendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
//  vld1/vst1 cannot pre-decrement, so each pointer is explicitly stepped
//  back by eight bytes before the transfer.
    subs    r2, #0x18
    blo     L_descendingUnalignedVectorCleanup
0:  tst     r0, #0x18
    beq     L_descendingUnalignedCachelineCopy
    sub     r1, #8
    vld1.8  {d0}, [r1]
    sub     r0, #8
    vst1.8  {d0}, [r0,:64]
    subs    r2, #8
    bhs     0b
L_descendingUnalignedVectorCleanup:
    adds    r2, #0x18
    blo     L_descendingUnalignedByteCleanup
0:  sub     r1, #8
    vld1.8  {d0}, [r1]
    sub     r0, #8
    vst1.8  {d0}, [r0,:64]
    subs    r2, #8
    bhs     0b
    b       L_descendingUnalignedByteCleanup

L_descendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address. r4 (preserved by ESTABLISH_FRAME) holds a stride of
//  -32 for the post-indexed loads and stores; both pointers are pre-biased by
//  32 bytes before the loop and restored afterwards.
    sub     r1, #32
    sub     r0, #32
    mov     r4, #-32
0:  vld1.8  {q0,q1}, [r1], r4
    pld     [r1, #-0x60]
    vst1.8  {q0,q1}, [r0,:256], r4
    subs    r2, #0x20
    bhs     0b
    add     r1, #32
    add     r0, #32
    b       L_descendingUnalignedVectorCleanup

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD