/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*****************************************************************************
 * Cortex-A8 implementation                                                  *
 *****************************************************************************/
// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009
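//
// As a rough illustration of that dispatch policy, the decision reduces to
// the following C sketch (an illustration only; the thresholds restate the
// comment above, and in the real code they are applied after the alignment
// handling, not up front):
//
//     #include <stddef.h>
//     enum copy_path { PATH_SCALAR, PATH_CORE_REGISTERS, PATH_NEON };
//     static enum copy_path choose_path(size_t length) {
//         if (length <= 8)                      return PATH_SCALAR;
//         if (length >= 1024 && length < 32768) return PATH_CORE_REGISTERS;
//         return PATH_NEON;
//     }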
// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t length);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t length);
//
// All three copy length successive bytes from source to destination. memmove
// and memcpy return destination, whereas bcopy has no return value. Copying
// takes place as if it were through a temporary buffer -- after return,
// destination contains exactly the bytes from source, even if the buffers
// overlap.
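//
// For reference, the relationship between the three entry points in C (a
// minimal sketch; the _ref suffix is mine, not a name used by this file):
//
//     #include <stddef.h>
//     #include <string.h>
//     // bcopy is memmove with the first two arguments swapped and no
//     // return value, which is why the bcopy entry below only shuffles
//     // registers before falling into memmove.
//     void bcopy_ref(const void *source, void *destination, size_t length) {
//         memmove(destination, source, length);
//     }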
        mov     r3, r0          // swap the first and second arguments
        mov     r0, r1          // and fall through into memmove
        mov     r1, r3          //
        subs    r3, r0, r1      // offset = destination addr - source addr
        it      eq              //
        bxeq    lr              // if source == destination, early out
// Our preference is for using a (faster) front-to-back copy. However, if
// 0 < offset < length, it is necessary to copy back-to-front for correctness.
// We have already ruled out offset == 0, so we can use an unsigned compare
// with length -- if offset is higher, offset is either greater than length
// or negative, and in either case a front-to-back copy is safe.
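//
// In C terms the direction test is a single unsigned comparison (a sketch;
// the pointer arithmetic is written with uintptr_t to keep it well defined):
//
//     #include <stdint.h>
//     #include <stddef.h>
//     static int must_copy_backwards(void *dst, const void *src, size_t length) {
//         // offset wraps to a huge unsigned value when dst precedes src, so
//         // after the offset == 0 early-out one unsigned compare decides it.
//         size_t offset = (size_t)((uintptr_t)dst - (uintptr_t)src);
//         return offset != 0 && offset < length;
//     }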
/*****************************************************************************
 * back to front copy                                                        *
 *****************************************************************************/
        mov     ip, r0          // copy destination pointer.
        add     r1, r2          // move source pointer to end of source array
        add     ip, r2          // move destination pointer to end of dest array

        subs    r2, $8          // if length - 8 is negative (i.e. length
        blt     L_scalarReverseCopy // is less than 8), jump to cleanup path.
        tst     ip, $7          // if (destination + length) is doubleword
        beq     L_vectorReverseCopy // aligned, jump to fast path.
0:      ldrb    r3, [r1, $-1]!  // load byte
        sub     r2, $1          // decrement length
        strb    r3, [ip, $-1]!  // store byte
        tst     ip, $7          // test alignment
        bne     0b              //

        cmp     r2, $0          // if length - 8 is negative,
        blt     L_scalarReverseCopy // jump to the cleanup code
/*****************************************************************************
 * destination is doubleword aligned                                         *
 *****************************************************************************/
L_vectorReverseCopy:
        ands    r3, r1, $3      // Extract the alignment of the source
        tbh     [pc, r3, lsl $1] // Dispatch table on source alignment
0:
        .short (L_reverseAligned0-0b)/2 // The NEON alignment hardware does not work
        .short (L_reverseAligned1-0b)/2 // properly with sub 4-byte alignment and
        .short (L_reverseAligned2-0b)/2 // buffers that are uncacheable, so we need
        .short (L_reverseAligned3-0b)/2 // to have a software workaround.
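//
// The tbh above is a jump table indexed by the low two bits of the source
// address. A C sketch of the same dispatch (the handler names here are
// hypothetical stand-ins for the labels in the table):
//
//     #include <stdint.h>
//     extern void reverse_aligned0(void), reverse_aligned1(void),
//                 reverse_aligned2(void), reverse_aligned3(void);
//     static void (*const reverse_dispatch[4])(void) = {
//         reverse_aligned0, reverse_aligned1, reverse_aligned2, reverse_aligned3
//     };
//     static void dispatch_on_source_alignment(const void *src) {
//         reverse_dispatch[(uintptr_t)src & 3]();
//     }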
/*****************************************************************************
 * source is also at least word aligned                                      *
 *****************************************************************************/

L_reverseAligned0:
        subs    r2, $0x38       // if length - 64 is negative, jump to
        blt     L_reverseVectorCleanup // the cleanup path.
        tst     ip, $0x38       // if (destination + length) is cacheline
        beq     L_reverseCachelineAligned // aligned, jump to the fast path.
0:      sub     r1, $8          // copy eight bytes at a time until the
        vld1.32 {d0}, [r1]      // destination is 64-byte aligned.
        sub     ip, $8          //
        sub     r2, $8          //
        vst1.64 {d0}, [ip, :64] //
        tst     ip, $0x38       //
        bne     0b              //

        cmp     r2, $0          // if length - 64 is negative,
        blt     L_reverseVectorCleanup // jump to the cleanup code
L_reverseCachelineAligned:
        sub     r3, r2, $0x3c0  // If 1024 < length < 32768, use core
        cmp     r3, $0x7c00     // register copies instead of NEON to
        blo     L_useSTMDB      // control energy usage.
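//
// The sub/cmp/blo pair is a single-compare range check: r2 holds the
// remaining length minus 64 here, so r3 is the remaining length minus 1024,
// and the unsigned "blo" is taken exactly when 1024 <= length < 32768
// (0x7c00 == 32768 - 1024). A C sketch of the same test:
//
//     #include <stdint.h>
//     static int use_core_registers(uint32_t length) {
//         // (length - 1024) wraps to a huge value when length < 1024, so
//         // one unsigned compare covers both ends of the range.
//         return (length - 1024u) < (32768u - 1024u);
//     }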
        sub     r1, $32         // decrement source
        sub     ip, $32         // decrement destination
        mov     r3, $-32        // load address increment
        tst     r1, $0x1f       // if source shares 32 byte alignment
        beq     L_reverseSourceAligned // jump to loop with more alignment hints
        vld1.32 {q2,q3}, [r1], r3 // This loop handles 4-byte aligned copies
        vld1.32 {q0,q1}, [r1], r3 // as generally as possible.
        subs    r2, $64         //
        vst1.64 {q2,q3}, [ip,:256], r3 // The Cortex-A8 NEON unit does not always
        blt     1f              // properly handle misalignment in vld1
        .align 3                // with an element size of 8 or 16, so
0:      vld1.32 {q2,q3}, [r1], r3 // this is the best we can do without
        vst1.64 {q0,q1}, [ip,:256], r3 // handling alignment in software.
        vld1.32 {q0,q1}, [r1], r3 //
        subs    r2, $64         //
        vst1.64 {q2,q3}, [ip,:256], r3 //
        bge     0b              //
        b       1f              //
L_reverseSourceAligned:
        vld1.64 {q2,q3}, [r1,:256], r3 // Identical to loop above except for
        vld1.64 {q0,q1}, [r1,:256], r3 // additional alignment information; this
        subs    r2, $64         // gets an additional .5 bytes per cycle
        vst1.64 {q2,q3}, [ip,:256], r3 // on Cortex-A8.
        blt     1f              //
        .align 3                //
0:      vld1.64 {q2,q3}, [r1,:256], r3 //
        vst1.64 {q0,q1}, [ip,:256], r3 //
        vld1.64 {q0,q1}, [r1,:256], r3 //
        subs    r2, $64         //
        vst1.64 {q2,q3}, [ip,:256], r3 //
        bge     0b              //

1:      vst1.64 {q0,q1}, [ip,:256], r3 // loop cleanup: final 32 byte store
        add     r1, $32         // point source at last element stored
        add     ip, $32         // point destination at last element stored
L_reverseVectorCleanup:
        adds    r2, $0x38       // If (length - 8) < 0, goto scalar cleanup
        blt     L_scalarReverseCopy //

0:      sub     r1, $8          // copy eight bytes at a time until
        vld1.32 {d0}, [r1]      // (length - 8) < 0.
        sub     ip, $8          //
        subs    r2, $8          //
        vst1.64 {d0}, [ip, :64] //
        bge     0b              //
/*****************************************************************************
 * sub-doubleword cleanup copies                                             *
 *****************************************************************************/
L_scalarReverseCopy:
        adds    r2, #0x8        // restore length
        it      eq              // if this is zero
        bxeq    lr              // early out

0:      ldrb    r3, [r1, #-1]!  // load a byte from source
        strb    r3, [ip, #-1]!  // store to destination
        subs    r2, #0x1        // subtract one from length
        bne     0b              // if non-zero, repeat
        bx      lr              //
/*****************************************************************************
 * STMDB loop for 1k-32k buffers                                             *
 *****************************************************************************/

L_useSTMDB:
        push    {r4-r8,r10,r11} // save the callee-saved registers we use
0:      ldmdb   r1!, {r3-r8,r10,r11}
        subs    r2, $64         // copy 64 bytes per iteration
        stmdb   ip!, {r3-r8,r10,r11}
        ldmdb   r1!, {r3-r8,r10,r11}
        stmdb   ip!, {r3-r8,r10,r11}
        bge     0b
        pop     {r4-r8,r10,r11}
        b       L_reverseVectorCleanup
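//
// Each LDMDB/STMDB pair moves 32 bytes through eight core registers. A rough
// C analogue of one 64-byte iteration of this loop (a sketch only; the
// pointers are word aligned on this path):
//
//     #include <stdint.h>
//     static void copy64_backwards(uint32_t **dst, const uint32_t **src) {
//         for (int i = 0; i < 16; ++i)   // 16 words = 64 bytes, moved back
//             *--(*dst) = *--(*src);     // to front through core registers
//     }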
/*****************************************************************************
 * Misaligned reverse vld1 loop                                              *
 *****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
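//
// The same realignment step written with NEON intrinsics (a sketch for the
// offset == 1 case; vext requires a compile-time constant shift, which is
// why the assembly expands one copy of the macro per alignment case):
//
//     #include <arm_neon.h>
//     // lo and hi are two consecutive 4-byte aligned doublewords; the
//     // result is the misaligned doubleword starting 1 byte into lo.
//     static uint8x8_t realign_by_1(uint8x8_t lo, uint8x8_t hi) {
//         return vext_u8(lo, hi, 1);
//     }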
#define RCOPY_UNALIGNED(offset) \
        vld1.32 {d2,d3}, [r1], r3      ;\
0:      vext.8  d0, d2, d3, $(offset)  ;\
        vld1.32 {d2}, [r1], r3         ;\
        vst1.64 {d0}, [ip, :64], r3    ;\
1:      vext.8  d0, d2, d3, $(offset)  ;\
        vst1.64 {d0}, [ip, :64]        ;\
        b       L_scalarReverseCopy

L_reverseAligned1: RCOPY_UNALIGNED(1)
L_reverseAligned2: RCOPY_UNALIGNED(2)
L_reverseAligned3: RCOPY_UNALIGNED(3)
/*****************************************************************************
 * front to back copy                                                        *
 *****************************************************************************/
        mov     ip, r0          // copy destination pointer.
        subs    r2, $8          // if length - 8 is negative (i.e. length
        blt     L_scalarCopy    // is less than 8), jump to cleanup path.
        tst     ip, $7          // if the destination is doubleword
        beq     L_vectorCopy    // aligned, jump to fast path.
0:      ldrb    r3, [r1], $1    // load byte
        sub     r2, $1          // decrement length
        strb    r3, [ip], $1    // store byte
        tst     ip, $7          // test alignment
        bne     0b              //

        cmp     r2, $0          // if length - 8 is negative,
        blt     L_scalarCopy    // jump to the cleanup code
/*****************************************************************************
 * destination is doubleword aligned                                         *
 *****************************************************************************/
L_vectorCopy:
        ands    r3, r1, $3      // Extract the alignment of the source
        tbh     [pc, r3, lsl $1] // Dispatch table on source alignment
0:
        .short (L_sourceAligned0-0b)/2 // The NEON alignment hardware does not work
        .short (L_sourceAligned1-0b)/2 // properly with sub 4-byte alignment and
        .short (L_sourceAligned2-0b)/2 // buffers that are uncacheable, so we need
        .short (L_sourceAligned3-0b)/2 // to have a software workaround.
/*****************************************************************************
 * source is also at least word aligned                                      *
 *****************************************************************************/

L_sourceAligned0:
        subs    r2, $0x38       // If (length - 64) < 0
        blt     L_vectorCleanup // jump to cleanup code
        tst     ip, $0x38       // If destination is 64 byte aligned
        beq     L_cachelineAligned // jump to main loop
0:      vld1.32 {d0}, [r1]!     // Copy one doubleword at a time until
        sub     r2, $8          // the destination is 64-byte aligned.
        vst1.64 {d0}, [ip, :64]! //
        tst     ip, $0x38       //
        bne     0b              //

        cmp     r2, $0          // If (length - 64) < 0, goto cleanup
        blt     L_vectorCleanup //
L_cachelineAligned:
        sub     r3, r2, $0x3c0  // If 1024 < length < 32768, use core
        cmp     r3, $0x7c00     // register copies instead of NEON to
        blo     L_useSTMIA      // control energy usage.
        tst     r1, $0x1f       // If source has 32-byte alignment, use
        beq     L_sourceAligned32 // an optimized loop.
        vld1.32 {q2,q3}, [r1]!  // This is the most common path for small
        vld1.32 {q0,q1}, [r1]!  // copies, which are alarmingly frequent.
        subs    r2, #0x40       // It requires 4-byte alignment on the
        vst1.64 {q2,q3}, [ip, :256]! // source. For ordinary malloc'd buffers,
        blt     1f              // this path could handle even single-byte
        .align 3                // alignment at speed by using vld1.8
0:      vld1.32 {q2,q3}, [r1]!  // instead of vld1.32; however, the NEON
        vst1.64 {q0,q1}, [ip, :256]! // alignment handler misbehaves for some
        vld1.32 {q0,q1}, [r1]!  // special copies if the element size is
        subs    r2, #0x40       // 8 or 16, so we need to work around
        vst1.64 {q2,q3}, [ip, :256]! // sub 4-byte alignment in software, in
        bge     0b              // another code path.
        b       1f              //

L_sourceAligned32:
        vld1.64 {q2,q3}, [r1, :256]! // When the source shares 32-byte alignment
        vld1.64 {q0,q1}, [r1, :256]! // with the destination, we use this loop
        subs    r2, #0x40       // instead, which specifies the maximum
        vst1.64 {q2,q3}, [ip, :256]! // :256 alignment on all loads and stores.
        blt     1f              //
        .align 3                // This gets an additional .5 bytes per
0:      vld1.64 {q2,q3}, [r1, :256]! // cycle for in-cache copies, which is not
        vst1.64 {q0,q1}, [ip, :256]! // insignificant for this (rather common)
        vld1.64 {q0,q1}, [r1, :256]! // case.
        subs    r2, #0x40       //
        vst1.64 {q2,q3}, [ip, :256]! // This is identical to the above loop,
        bge     0b              // except for the additional alignment.
1:      vst1.64 {q0,q1}, [ip, :256]! //
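//
// Both loops above are software pipelined: each trip stores the 32 bytes
// loaded on the previous trip while loading the next 32, hiding the vld1
// latency. A simplified forward-only sketch with NEON intrinsics (assumes
// the byte count is a positive multiple of 32 and the buffers do not
// overlap; the function name is mine):
//
//     #include <arm_neon.h>
//     #include <stddef.h>
//     #include <stdint.h>
//     static void copy_pipelined(uint8_t *dst, const uint8_t *src, size_t n) {
//         uint8x16_t a0 = vld1q_u8(src), a1 = vld1q_u8(src + 16);
//         src += 32;  n -= 32;              // prime the pipeline
//         while (n >= 32) {
//             uint8x16_t b0 = vld1q_u8(src), b1 = vld1q_u8(src + 16);
//             src += 32;  n -= 32;          // load the next block ...
//             vst1q_u8(dst, a0);  vst1q_u8(dst + 16, a1);
//             dst += 32;                    // ... while storing the previous
//             a0 = b0;  a1 = b1;
//         }
//         vst1q_u8(dst, a0);  vst1q_u8(dst + 16, a1);  // drain the pipeline
//     }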
L_vectorCleanup:
        adds    r2, $0x38       // If (length - 8) < 0, goto scalar cleanup
        blt     L_scalarCopy    //

0:      vld1.32 {d0}, [r1]!     // Copy one doubleword at a time until
        subs    r2, $8          // (length - 8) < 0.
        vst1.64 {d0}, [ip, :64]! //
        bge     0b              //
/*****************************************************************************
 * sub-doubleword cleanup copies                                             *
 *****************************************************************************/
L_scalarCopy:
        adds    r2, #0x8        // restore length
        it      eq              // if this is zero
        bxeq    lr              // early out

0:      ldrb    r3, [r1], #1    // load a byte from source
        strb    r3, [ip], #1    // store to destination
        subs    r2, #1          // subtract one from length
        bne     0b              // if non-zero, repeat
        bx      lr              //
/*****************************************************************************
 * STMIA loop for 1k-32k buffers                                             *
 *****************************************************************************/

L_useSTMIA:
        push    {r4-r8,r10,r11} // save the callee-saved registers we use
0:      ldmia   r1!, {r3-r8,r10,r11}
        subs    r2, #0x40       // copy 64 bytes per iteration
        stmia   ip!, {r3-r8,r10,r11}
        ldmia   r1!, {r3-r8,r10,r11}
        stmia   ip!, {r3-r8,r10,r11}
        bge     0b
        pop     {r4-r8,r10,r11}
        b       L_vectorCleanup
/*****************************************************************************
 * Misaligned vld1 loop                                                      *
 *****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
#define COPY_UNALIGNED(offset) \
        vld1.32 {d2,d3}, [r1]!         ;\
0:      vext.8  d0, d2, d3, $(offset)  ;\
        vld1.32 {d3}, [r1]!            ;\
        vst1.64 {d0}, [ip, :64]!       ;\
1:      vext.8  d0, d2, d3, $(offset)  ;\
        vst1.64 {d0}, [ip, :64]!       ;\
2:      add     r1, $(offset)          ;\