/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
* Cortex-A8 implementation *
*****************************************************************************/

// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009

.text
.code 16
.syntax unified

// void bcopy(const void * source,
// void * destination,
// size_t length);
//
// void *memmove(void * destination,
// const void * source,
// size_t n);
//
// void *memcpy(void * restrict destination,
// const void * restrict source,
// size_t n);
//
// all copy n successive bytes from source to destination. memmove and memcpy
// return destination, whereas bcopy has no return value. copying takes place
// as if it were through a temporary buffer -- after return destination contains
// exactly the bytes from source, even if the buffers overlap.
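//
// As an informal C sketch only (not part of this file; temp_copy is a
// hypothetical name), the "as if through a temporary buffer" requirement can
// be pictured as:
//
//     #include <stdlib.h>
//     #include <string.h>
//
//     void *temp_copy(void *dst, const void *src, size_t n) {
//         unsigned char *tmp = malloc(n);     // conceptual temporary buffer
//         if (tmp) {
//             memcpy(tmp, src, n);            // read all of the source first
//             memcpy(dst, tmp, n);            // then write the destination
//             free(tmp);
//         }
//         return dst;                         // memcpy/memmove return dst
//     }
//
// bcopy(src, dst, n) behaves like memmove(dst, src, n), minus the return
// value; the code below gets the same effect with no temporary buffer by
// choosing the copy direction based on how the buffers overlap.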

.thumb_func _bcopy$VARIANT$CortexA8
.thumb_func _memmove$VARIANT$CortexA8
.thumb_func _memcpy$VARIANT$CortexA8
.globl _bcopy$VARIANT$CortexA8
.globl _memmove$VARIANT$CortexA8
.globl _memcpy$VARIANT$CortexA8

#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}

/*****************************************************************************
* entry points *
*****************************************************************************/

.align 2
_bcopy$VARIANT$CortexA8:

// bcopy has the first and second arguments in the opposite order from the C
// library functions memmove and memcpy. If bcopy is called, we swap these
// two arguments and then fall into memmove.

mov r3, r0
mov r0, r1
mov r1, r3

.align 2
_memmove$VARIANT$CortexA8:
_memcpy$VARIANT$CortexA8:

// At entry to memmove/memcpy, registers contain the following values:
//
// r0 pointer to the first byte of the destination buffer
// r1 pointer to the first byte of the source buffer
// r2 number of bytes to copy
//
// Our preference is to use a (faster and easier to understand) front-to-back
// copy of the buffer. However, memmove requires that copies take place as
// though through a temporary buffer. This means that if the buffers overlap,
// it may be necessary to copy the buffer in reverse order.
//
// To properly detect such overlap, we begin by computing the offset between
// the source and destination pointers. If the offset happens to be zero,
// then there is no work to be done, so we can early out.

subs r3, r0, r1
it eq
bxeq lr

// r3 now contains the offset between the buffers, (destination - source). If
// 0 < offset < length, then the high-addressed bytes of the source alias the
// low-addressed bytes of the destination. Thus, if we were to perform the
// copy in ascending address order, we would overwrite the high-addressed
// source bytes before we had a chance to copy them, and the data would be lost.
//
// Thus, we can use the front-to-back copy only if the offset is negative or at
// least as large as the length. This is the case precisely if offset compares
// unsigned higher than or the same as length.

cmp r3, r2
bhs L_copyFrontToBack
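
// In C terms, the two instructions above perform roughly the following test
// (a sketch assuming unsigned pointer arithmetic; the two copy_* names are
// just stand-ins for L_copyFrontToBack and the fall-through path):
//
//     uintptr_t offset = (uintptr_t)dst - (uintptr_t)src;   // r3
//     if (offset >= n)            // also true when dst < src (wraps high)
//         copy_front_to_back();   // ascending copy is safe
//     else
//         copy_back_to_front();   // destination overlaps the source's tail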

/*****************************************************************************
* back to front copy *
*****************************************************************************/

// Here we have fallen through into the back-to-front copy. We preserve the
// original destination pointer in r0 because it is the return value for the
// routine, and update the other registers as follows:
//
// r1 one byte beyond the end of the source buffer
// r2 number of bytes to copy
// ip one byte beyond the end of the destination buffer

mov ip, r0
add r1, r2
add ip, r2

// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

subs r2, $8
blt L_scalarReverseCopy
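
// The "length minus 8" bias established here is used throughout the routine:
// keeping r2 equal to (bytes remaining - 8) lets blt/bge against zero stand
// in for "fewer than / at least 8 bytes remain". A rough C analogue (the
// copy_8_bytes helper is purely illustrative):
//
//     ptrdiff_t n8 = (ptrdiff_t)n - 8;    // r2 after the subs above
//     while (n8 >= 0) {                   // at least 8 bytes still to copy
//         copy_8_bytes();
//         n8 -= 8;
//     }
//     n8 += 8;                            // restore the true remaining count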

// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.

tst ip, $7
beq L_vectorReverseCopy

// Otherwise, we copy a single byte at a time, in order of descending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 8
// r3 temporary to hold the byte that is being copied
// ip pointer one byte past the destination of the next byte to be copied
//
//                              byte that will be copied in this iteration
//                              |   byte that was copied in the previous iteration
//  Source buffer:              v   v
//  ------------------------+---+---+-------------------------
//   bytes still to copy ...|   |   | ... bytes already copied
//  ------------------------+---+---+-------------------------
//                                  ^
//                                  r1 holds the address of this byte

0: ldrb r3, [r1, $-1]!
sub r2, $1
strb r3, [ip, $-1]!
tst ip, $7
bne 0b
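
// A rough C equivalent of the pre-alignment loop above (dst_end and src_end
// are hypothetical names for the pointers kept in ip and r1; n8 is the
// biased count in r2):
//
//     while ((uintptr_t)dst_end & 7) {    // until destination is 8-byte aligned
//         *--dst_end = *--src_end;        // one byte, descending addresses
//         n8 -= 1;
//     }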

// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.

cmp r2, $0
blt L_scalarReverseCopy

/*****************************************************************************
* destination is 8 byte aligned *
*****************************************************************************/

L_vectorReverseCopy:

// At this point, registers contain the following values:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to copy) - 8
// ip pointer one byte past the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA busses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.

ands r3, r1, $3
tbh [pc, r3, lsl $1]
0:
.short (L_reverseAligned0-0b)/2
.short (L_reverseAligned1-0b)/2
.short (L_reverseAligned2-0b)/2
.short (L_reverseAligned3-0b)/2
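
// Conceptually, the tbh dispatch above is a switch on the source address
// modulo 4 (sketch only; the labels are the ones defined later in this file):
//
//     switch ((uintptr_t)src & 3) {
//     case 0: goto L_reverseAligned0;   // source is at least 4-byte aligned
//     case 1: goto L_reverseAligned1;   // fixed up with vext #1
//     case 2: goto L_reverseAligned2;   // fixed up with vext #2
//     case 3: goto L_reverseAligned3;   // fixed up with vext #3
//     }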

/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/

L_reverseAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

subs r2, $0x38
blt L_reverseVectorCleanup

// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.

tst ip, $0x38
beq L_reverseCachelineAligned

// Otherwise, we copy 8 bytes at a time, in order of descending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 64
// ip pointer one byte past the destination of the next byte to be copied
// d0 temporary storage for copy
//
//     bytes that will be copied after this iteration
//     |           8 byte block that will be copied in this iteration
//     v           v
//  --------------+-------------------------------+---------------------
//                |  0   1   2   3   4   5   6   7|  bytes already copied
//  --------------+-------------------------------+---------------------
//                                                ^
//                                                r1 points here

0: sub r1, $8
vld1.32 {d0}, [r1]
sub ip, $8
sub r2, $8
tst ip, $0x38
vst1.64 {d0}, [ip,:64]
bne 0b

// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.

cmp r2, $0
blt L_reverseVectorCleanup

/*****************************************************************************
* destination is cacheline aligned *
*****************************************************************************/

L_reverseCachelineAligned:

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.

sub r3, r2, $0x3c0
cmp r3, $0x7c00
blo L_useSTMDB
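
// Since r2 holds (bytes remaining - 64) here, the two instructions above form
// a branch-free range check; a C sketch with 32-bit unsigned arithmetic,
// where n is the remaining byte count and use_core_register_copy is just a
// stand-in for L_useSTMDB:
//
//     uint32_t t = n - 1024;          // r3 = r2 - 0x3c0, with r2 == n - 64
//     if (t < 0x7c00)                 // i.e. 1024 <= n < 32768 (1k..32k)
//         use_core_register_copy();   // prefer the GPR loop for this range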

// Pre-decrement the source (r1) and destination (ip) pointers so that they
// point to the first byte of the trailing 32-byte window of each buffer.
// Additionally, load the address increment of -32 into r3.

sub r1, $32
sub ip, $32
mov r3, $-32

// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

tst r1, $0x1f
beq L_reverseSourceAligned

// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of descending memory address. Within
// this loop, registers are used as follows:
//
// r0 original destination pointer (unmodified)
// r1 pointer to the next 32-byte block to load
// r2 (number of bytes remaining to copy) - 64
// r3 address increment of -32.
// ip pointer to which the next 32-byte block is to be stored
// q0-q3 temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.

vld1.32 {q2,q3}, [r1], r3
vld1.32 {q0,q1}, [r1], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
blt 1f
.align 3
0: vld1.32 {q2,q3}, [r1], r3
vst1.64 {q0,q1}, [ip,:256], r3
vld1.32 {q0,q1}, [r1], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
bge 0b
b 1f

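// Structurally, both this loop and the aligned variant below are software
// pipelined: loads run one 32-byte block ahead of the stores, so exactly one
// store is left over for the shared cleanup at label 1. In C-like pseudocode
// (load_32/store_32 and the names a, b are illustrative only; n is the
// biased count kept in r2):
//
//     a = load_32(src); src -= 32;    // prime the pipeline with two blocks
//     b = load_32(src); src -= 32;
//     n -= 64;
//     store_32(dst, a); dst -= 32;
//     while (n >= 0) {
//         a = load_32(src); src -= 32;
//         store_32(dst, b); dst -= 32;
//         b = load_32(src); src -= 32;
//         n -= 64;
//         store_32(dst, a); dst -= 32;
//     }
//     store_32(dst, b);               // the shared final store at label 1
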
L_reverseSourceAligned:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.

vld1.64 {q2,q3}, [r1,:256], r3
vld1.64 {q0,q1}, [r1,:256], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
blt 1f
.align 3
0: vld1.64 {q2,q3}, [r1,:256], r3
vst1.64 {q0,q1}, [ip,:256], r3
vld1.64 {q0,q1}, [r1,:256], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
bge 0b

// Final vector store for both of the above loops.

1: vst1.64 {q0,q1}, [ip,:256], r3

// Adjust the source and destination pointers so that they once again point to
// the last byte that we used (which is one byte higher than the address that
// we will use next for any required cleanup).

add r1, $32
add ip, $32

L_reverseVectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

adds r2, r2, $0x38
blt L_scalarReverseCopy

// This loop copies 8 bytes at a time in order of descending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 8
// ip pointer one byte past the destination of the next byte to be copied
// d0 temporary storage for copy

0: sub r1, $8
vld1.32 {d0}, [r1]
sub ip, $8
subs r2, $8
vst1.64 {d0}, [ip,:64]
bge 0b

/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/

L_scalarReverseCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

adds r2, $8
it eq
bxeq lr

// Copy one byte at a time in descending address order until we reach the front
// of the buffer. Within this loop, registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 bytes remaining to be copied
// r3 temporary to hold the byte that is being copied
// ip pointer one byte past the destination of the next byte to be copied

0: ldrb r3, [r1, $-1]!
subs r2, $1
strb r3, [ip, $-1]!
bne 0b
bx lr

/*****************************************************************************
* STMDB loop for 1k-32k buffers *
*****************************************************************************/

// This loop copies 64 bytes each iteration in order of descending memory
// address, using the GPRs instead of NEON.
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 64
// r3-r6, r8-r11 (COPY_REGISTERS) temporary registers used for moving data
// ip pointer to one byte past the next location to store to

L_useSTMDB:
push SAVE_REGISTERS
.align 3
0: ldmdb r1!, COPY_REGISTERS
subs r2, r2, $64
stmdb ip!, COPY_REGISTERS
ldmdb r1!, COPY_REGISTERS
pld [r1, $-64]
stmdb ip!, COPY_REGISTERS
bge 0b
pop SAVE_REGISTERS
b L_reverseVectorCleanup
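
// Each ldmdb/stmdb pair above moves the eight COPY_REGISTERS (32 bytes), so
// one trip around the loop moves 64 bytes through the integer register file.
// Schematically (sketch only; n is the biased count in r2):
//
//     do {
//         load_32_descending();  store_32_descending();   // first ldmdb/stmdb pair
//         load_32_descending();  store_32_descending();   // second pair (plus a pld
//         n -= 64;                                        //  prefetch of the source)
//     } while (n >= 0);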

/*****************************************************************************
* Misaligned reverse vld1 loop *
*****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
//  |  8 bytes from this iteration  |  8 bytes from last iteration  |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//              ^8 bytes to store this iteration^                   |
//                                           could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, the very first load could slop over into a page that is not
// mapped readable. In order to prevent this scenario, we copy the first eight
// bytes one at a time before beginning the main loop.
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
// r0 original destination pointer
// r1 pointer to the next block of 8 bytes to load
// r2 (bytes remaining to copy) - 8
// ip pointer to the next block of 8 bytes to store
// d0 next 8 bytes to store
// d2 8 bytes loaded in the previous iteration
// d3 8 bytes loaded two iterations ago

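// A rough C model of the vext merge used in the macro below: two adjacent
// aligned 8-byte loads form a 16-byte window, and the 8 bytes to store sit
// at the fixed misalignment offset within it (new8, prev8 and out8 are
// illustrative names; memcpy stands in for the vector moves):
//
//     uint8_t win[16];
//     memcpy(win,     new8,  8);       // d2: the newer load (lower addresses)
//     memcpy(win + 8, prev8, 8);       // d3: the older load (higher addresses)
//     memcpy(out8, win + offset, 8);   // vext.8 d0, d2, d3, #offset
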
#define RCOPY_UNALIGNED(offset) \
0: ldrb r3, [r1,$-1]! ;\
strb r3, [ip,$-1]! ;\
subs r2, $1 ;\
blt L_scalarReverseCopy ;\
tst ip, $7 ;\
bne 0b ;\
bic r1, $3 ;\
sub r1, $8 ;\
sub ip, $8 ;\
mov r3, $-8 ;\
vld1.32 {d2,d3}, [r1], r3 ;\
subs r2, $8 ;\
blt 1f ;\
0: vext.8 d0, d2, d3, $(offset);\
vmov d3, d2 ;\
vld1.32 {d2}, [r1], r3 ;\
subs r2, $8 ;\
vst1.64 {d0}, [ip, :64], r3 ;\
bge 0b ;\
1: vext.8 d0, d2, d3, $(offset);\
add r1, $8 ;\
vst1.64 {d0}, [ip, :64] ;\
2: add r1, $(offset);\
b L_scalarReverseCopy

L_reverseAligned1:
RCOPY_UNALIGNED(1)
L_reverseAligned2:
RCOPY_UNALIGNED(2)
L_reverseAligned3:
RCOPY_UNALIGNED(3)

/*****************************************************************************
* front to back copy *
*****************************************************************************/

L_copyFrontToBack:

// Here the pointers are laid out such that we can use our preferred
// front-to-back copy. We preserve the original destination pointer in r0 because
// it is the return value for the routine, and copy it to ip to use in this
// routine.

mov ip, r0

// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

subs r2, $8
blt L_scalarCopy

// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.

tst ip, $7
beq L_vectorCopy

// Otherwise, we copy a single byte at a time, in order of ascending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next byte to copy
// r2 (bytes remaining to be copied) - 8
// r3 temporary to hold the byte that is being copied
// ip pointer to the next byte to store to

0: ldrb r3, [r1], $1
sub r2, $1
strb r3, [ip], $1
tst ip, $7
bne 0b

// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.

cmp r2, $0
blt L_scalarCopy

/*****************************************************************************
* destination is doubleword aligned *
*****************************************************************************/

L_vectorCopy:

// At this point, registers contain the following values:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to copy) - 8
// ip pointer to the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA busses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.

ands r3, r1, $3
bic r1, $3
tbh [pc, r3, lsl $1]
0:
.short (L_sourceAligned0-0b)/2
.short (L_sourceAligned1-0b)/2
.short (L_sourceAligned2-0b)/2
.short (L_sourceAligned3-0b)/2

/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/

L_sourceAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

subs r2, $0x38
blt L_vectorCleanup

// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.

tst ip, $0x38
beq L_cachelineAligned

// Otherwise, we copy 8 bytes at a time, in order of ascending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 64
// ip pointer to the destination of the next byte to be copied
// d0 temporary storage for copy

0: vld1.32 {d0}, [r1]!
sub r2, $8
vst1.64 {d0}, [ip,:64]!
tst ip, $0x38
bne 0b

// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.

cmp r2, $0
blt L_vectorCleanup

/*****************************************************************************
* destination is cacheline aligned *
*****************************************************************************/

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.

L_cachelineAligned:
sub r3, r2, $0x3c0
cmp r3, $0x7c00
blo L_useSTMIA

// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

tst r1, $0x1f
beq L_sourceAligned32

// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of ascending memory address. Within
// this loop, registers are used as follows:
//
// r0 original destination pointer (unmodified)
// r1 pointer to the next 32-byte block to load
// r2 (number of bytes remaining to copy) - 64
// ip pointer to which the next 32-byte block is to be stored
// q0-q3 temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.

vld1.32 {q2,q3}, [r1]!
vld1.32 {q0,q1}, [r1]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
blt 1f
.align 3
0: vld1.32 {q2,q3}, [r1]!
vst1.64 {q0,q1}, [ip,:256]!
vld1.32 {q0,q1}, [r1]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
bge 0b
b 1f

L_sourceAligned32:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.

vld1.64 {q2,q3}, [r1,:256]!
vld1.64 {q0,q1}, [r1,:256]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
blt 1f
.align 3
0: vld1.64 {q2,q3}, [r1,:256]!
vst1.64 {q0,q1}, [ip,:256]!
vld1.64 {q0,q1}, [r1,:256]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
bge 0b

// Final vector store for both of the above loops.

1: vst1.64 {q0,q1}, [ip,:256]!

L_vectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

adds r2, $0x38
blt L_scalarCopy

// This loop copies 8 bytes at a time in order of ascending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 8
// ip pointer to the destination of the next byte to be copied
// d0 temporary storage for copy

0: vld1.32 {d0}, [r1]!
subs r2, $8
vst1.64 {d0}, [ip,:64]!
bge 0b

/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/

L_scalarCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

adds r2, $8
it eq
bxeq lr

// Copy one byte at a time in ascending address order until we reach the end
// of the buffer. Within this loop, registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next byte to be copied
// r2 bytes remaining to be copied
// r3 temporary to hold the byte that is being copied
// ip pointer to the destination of the next byte to be copied

0: ldrb r3, [r1], $1
strb r3, [ip], $1
subs r2, $1
bne 0b
bx lr

/*****************************************************************************
* STMIA loop for 1k-32k buffers *
*****************************************************************************/

// This loop copies 64 bytes each iteration in order of ascending memory
// address, using the GPRs instead of NEON.
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 64
// r3-r6, r8-r11 (COPY_REGISTERS) temporary registers used for moving data
// ip pointer to the next location to store to

L_useSTMIA:
push SAVE_REGISTERS
.align 3
0: ldmia r1!, COPY_REGISTERS
subs r2, r2, $64
stmia ip!, COPY_REGISTERS
ldmia r1!, COPY_REGISTERS
pld [r1, $64]
stmia ip!, COPY_REGISTERS
bge 0b
pop SAVE_REGISTERS
b L_vectorCleanup

/*****************************************************************************
* Misaligned forward vld1 loop *
*****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
//  |  8 bytes from last iteration  |  8 bytes from this iteration  |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//              ^8 bytes to store this iteration^                   |
//                                           could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, if we used this approach all the way to the end of the
// buffer, the very last 8 byte load might slop over onto a new page by 4
// bytes, and that new page might not be mapped into our process. Thus, we
// terminate this copy loop when fewer than 12 bytes remain to be copied,
// instead of the more natural-seeming termination condition of "8 bytes
// remaining" (the illustration above shows the worst case and demonstrates
// why 12 is a sufficiently safe condition).
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
// r0 original destination pointer
// r1 pointer to the next block of 8 bytes to load
// r2 (bytes remaining to copy) - 12
// ip pointer to the next block of 8 bytes to store
// d0 next 8 bytes to store
// d2 8 bytes loaded two iterations ago
// d3 8 bytes loaded in the previous iteration

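// One way to see why 12 (rather than 8) is a safe cutoff, given that page
// boundaries are 4-byte aligned: another aligned 8-byte load, at some 4-byte
// aligned address p, is only issued while at least 12 bytes remain unstored.
// Those 12 include the 8 bytes the load helps to store, which end at
// p+offset-1, so at least 4 source bytes exist beyond p+offset-1 and the
// buffer is mapped through p+offset+3 >= p+4 (offset >= 1). Both words of
// the load, [p,p+4) and [p+4,p+8), therefore begin at mapped bytes, and a
// 4-byte aligned word cannot straddle a page boundary, so the load never
// touches an unmapped page.
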
#define COPY_UNALIGNED(offset) \
subs r2, $4 ;\
blt 2f ;\
vld1.32 {d2,d3}, [r1]! ;\
subs r2, $8 ;\
blt 1f ;\
0: vext.8 d0, d2, d3, $(offset);\
vmov d2, d3 ;\
vld1.32 {d3}, [r1]! ;\
subs r2, $8 ;\
vst1.64 {d0}, [ip, :64]! ;\
bge 0b ;\
1: vext.8 d0, d2, d3, $(offset);\
sub r1, $8 ;\
vst1.64 {d0}, [ip, :64]! ;\
2: add r1, $(offset);\
add r2, $4 ;\
b L_scalarCopy

L_sourceAligned1:
COPY_UNALIGNED(1)
L_sourceAligned2:
COPY_UNALIGNED(2)
L_sourceAligned3:
COPY_UNALIGNED(3)

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD