arm/string/bcopy_Swift.s

   1 /*
   2  * Copyright (c) 2011 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  *
  23  *  This file implements the following functions for the Swift micro-arch:
  24  *
  25  *  void bcopy(const void * source,
  26  *             void * destination,
  27  *             size_t length);
  28  *
  29  *  void *memmove(void * destination,
  30  *                const void * source,
  31  *                size_t n);
  32  *
  33  *  void *memcpy(void * restrict destination,
  34  *               const void * restrict source,
  35  *               size_t n);
  36  *
  37  * All copy n successive bytes from source to destination.  Memmove and memcpy
  38  * return destination, whereas bcopy has no return value.  Copying takes place
  39  * as if it were through a temporary buffer -- after return destination
  40  * contains exactly the bytes from source, even if the buffers overlap (this is
  41  * not required of memcpy by the C standard; its behavior is undefined if the
  42  * buffers overlap, but we are holding ourselves to the historical behavior of
  43  * this function on OS X and iOS).
  44  */
  45
  46 #include <arm/arch.h>
  47 #if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
  48
  49 .syntax unified                     //  Unified ARM/Thumb assembler syntax.
  50 .code 16                            //  Everything below is Thumb code.
  51 .globl _bcopy$VARIANT$Swift         //  Each of the three entry points is
  52 .thumb_func _bcopy$VARIANT$Swift    //  exported and marked as a Thumb
  53 .globl _memmove$VARIANT$Swift       //  function.
  54 .thumb_func _memmove$VARIANT$Swift
  55 .globl _memcpy$VARIANT$Swift
  56 .thumb_func _memcpy$VARIANT$Swift
  57
  58 .text
  59 .align 4                            //  NOTE(review): presumably a power-of-two (16-byte) alignment here -- confirm.
  60 _bcopy$VARIANT$Swift:
  61 //  bcopy takes (source, destination, length), the reverse of memmove's pointer
  62 //  order.  Exchange r0 and r1, then fall through into the memmove entry.
  63     mov     r3,     r1              //  r3 is only a temporary here; the
  64     mov     r1,     r0              //  memmove entry recomputes it before
  65     mov     r0,     r3              //  reading it.
  66
  67 _memmove$VARIANT$Swift:
  68 _memcpy$VARIANT$Swift:
  69 //  Entry: r0 = destination, r1 = source, r2 = length.  None of the copy paths
  70 //  in this file write r0, so it still holds destination when we return.
  71 //
  72 //  Our preference is to copy in ascending address order, but if the beginning
  73 //  of the destination buffer aliases the end of the source buffer we must
  74 //  copy in descending order to preserve memmove semantics.  The test is:
  75 //
  76 //      destination - source < length    (unsigned compare)
  77 //
  78 //  If source is above destination the subtraction wraps around, but the
  79 //  wrapped value can only be smaller than length if the buffers do not
  80 //  overlap, so false positives due to the wraparound happen only in cases
  81 //  where copying in either order is correct.
  82     push    {r7,lr}             //  Save r7 and the return address, then
  83     mov     r7,         sp      //  point r7 at the saved pair.
  84     subs    r3,     r0, r1      //  r3 = destination - source.
  85     beq     L_exit              //  Same buffer: nothing to copy.
  86     mov     ip,         r0      //  ip is the working destination pointer.
  87     cmp     r3,         r2
  88     blo     L_descendingCopy    //  Overlap hazard: copy from the end down.
  89
  90 /*****************************************************************************
  91  *  Ascending copy                                                           *
  92  *****************************************************************************/
  93 //  Here r0 = ip = destination, r1 = source, r2 = length.
  94     subs    r3,     r2, #32     //  r3 = length - 32.  If length < 32, jump to
  95     blo     L_ascendingShort    //  the byte-at-a-time path for short buffers.
  96
  97     orr     lr,     r0, r1      //  If the length is not a multiple of 16, or
  98     orr     lr,         r2      //  either buffer is not 16-byte aligned, then
  99     ands    lr,         #0xf    //  some edging is needed; jump to a separate
 100     bne     L_ascendingEdging   //  branch to handle it.
 101
 102 /*****************************************************************************
 103  *  Ascending vector aligned copy                                            *
 104  *****************************************************************************/
 105 //  Loop invariant: r3 = (bytes not yet copied) - 32 at the top of each pass.
 106 0:  subs    r3,         #32     //  Copy 32 bytes at a time from src to dst,
 107     vld1.8  {q0,q1}, [r1,:128]! //  both of which have 16-byte alignment.
 108     vst1.8  {q0,q1}, [ip,:128]! //  The bhi acts on the flags from the subs:
 109     bhi     0b                  //  loop while more than 32 bytes remain.
 110 //  r3 is now (bytes remaining) - 32, which is zero or negative.
 111     add     r1,         r3      //  Backtrack both pointers by 32 - remaining
 112     vld1.8  {q0,q1}, [r1,:128]  //  and copy 32 bytes from src to dst.  This
 113     add     ip,         r3      //  copy may overlap the previous copy, and
 114     vst1.8  {q0,q1}, [ip,:128]  //  takes us precisely to the end of the
 115     pop     {r7,pc}             //  buffer.
 116
 117 /*****************************************************************************
 118  *  Ascending vector misaligned copy                                         *
 119  *****************************************************************************/
 120 //  Only reached with length >= 32, so the byte loop below cannot exhaust r2.
 121 L_ascendingEdging:
 122     tst     ip,         #0xf    //  Copy one byte at a time until the
 123     itttt   ne                  //  destination pointer has 16 byte alignment.
 124     ldrbne  r3,    [r1],#1      //  The sub below does not set flags, so the
 125     strbne  r3,    [ip],#1      //  bne still acts on the result of the tst.
 126     subne   r2,         #1
 127     bne     L_ascendingEdging
 128
 129     and     lr,     r1, #0xf    //  lr = offset of the source pointer within
 130     bic     r1,         #0xf    //  its 16-byte block; back r1 up to the start
 131     subs    r3,     r2, #32     //  of that block.  r3 = length - 32; if fewer
 132     blo     L_ascendingEdgingExit   //  than 32 bytes remain, skip the vector loop.
 133     tbh    [pc, lr, lsl #1]     //  Otherwise, we have a jump table based on
 134 0:                              //  the relative alignment of the buffers.
 135 .short (L_ascendingExtract0x0-0b)/2     //  Entry lr holds half the byte
 136 .short (L_ascendingExtract0x1-0b)/2     //  distance from label 0 to its handler.
 137 .short (L_ascendingExtract0x2-0b)/2
 138 .short (L_ascendingExtract0x3-0b)/2
 139 .short (L_ascendingExtract0x4-0b)/2
 140 .short (L_ascendingExtract0x5-0b)/2
 141 .short (L_ascendingExtract0x6-0b)/2
 142 .short (L_ascendingExtract0x7-0b)/2
 143 .short (L_ascendingExtract0x8-0b)/2
 144 .short (L_ascendingExtract0x9-0b)/2
 145 .short (L_ascendingExtract0xa-0b)/2
 146 .short (L_ascendingExtract0xb-0b)/2
 147 .short (L_ascendingExtract0xc-0b)/2
 148 .short (L_ascendingExtract0xd-0b)/2
 149 .short (L_ascendingExtract0xe-0b)/2
 150 .short (L_ascendingExtract0xf-0b)/2
 151
 152 L_ascendingExtract0x0:          //  If the two buffers are similarly aligned,
 153     subs    r3,         #32     //  we use a slightly simpler loop that just
 154     vld1.8  {q0,q1}, [r1,:128]! //  copies 32 bytes at a time.  It repeats
 155     vst1.8  {q0,q1}, [ip,:128]! //  while at least 32 bytes remain, leaving
 156     bhs     L_ascendingExtract0x0   //  r3 = (bytes remaining) - 32 on exit.
 157     b       L_ascendingEdgingExit
 158
 159 #define ASCENDING_EXTRACT(shift)\
 160 L_ascendingExtract ## shift:\
 161     vld1.8  {q8},    [r1,:128]!;\
 162 0:  vld1.8  {q9,q10},[r1,:128]!;\
 163     vext.8  q0, q8, q9, $(shift);\
 164     vext.8  q1, q9, q10,$(shift);\
 165     vmov    q8,         q10;\
 166     vst1.8  {q0,q1}, [ip,:128]!;\
 167     subs    r3,         $32;\
 168     bhs     0b;\
 169     sub     r1,         $16;\
 170     b       L_ascendingEdgingExit
 171
 172 ASCENDING_EXTRACT(0x1)          //  Otherwise, we use the loop implemented in
 173 ASCENDING_EXTRACT(0x2)          //  the above macro.  Each iteration loads 32
 174 ASCENDING_EXTRACT(0x3)          //  aligned source bytes, combines them with
 175 ASCENDING_EXTRACT(0x4)          //  the 16 bytes held over in q8 from the
 176 ASCENDING_EXTRACT(0x5)          //  previous load, and uses the VEXT
 177 ASCENDING_EXTRACT(0x6)          //  instruction to extract the 32 bytes that
 178 ASCENDING_EXTRACT(0x7)          //  can be stored to a 16-byte aligned location
 179 ASCENDING_EXTRACT(0x8)          //  in the destination buffer.  This continues
 180 ASCENDING_EXTRACT(0x9)          //  until fewer than 32 bytes remain to be
 181 ASCENDING_EXTRACT(0xa)          //  copied; r1 is then backed up over the 16
 182 ASCENDING_EXTRACT(0xb)          //  bytes that were loaded but not yet stored.
 183 ASCENDING_EXTRACT(0xc)          //  This is significantly faster than using
 184 ASCENDING_EXTRACT(0xd)          //  misaligned loads and stores, which are
 185 ASCENDING_EXTRACT(0xe)          //  very inefficient on Swift.  Each expansion
 186 ASCENDING_EXTRACT(0xf)          //  overwrites q0, q1, q8, q9 and q10.
 187
 188 L_ascendingEdgingExit:
 189     add     r1,         lr      //  Restore the source pointer (lr = its offset
 190     add     r2,     r3, #32     //  within the block) and the length (r3 + 32).
 191 L_ascendingShort:
 192     subs    r2,         #1      //  Copy one byte at a time until the buffer
 193     itt     hs                  //  is exhausted, then return.  A byte is
 194     ldrbhs  r3,    [r1],#1      //  copied only if r2 was nonzero before the
 195     strbhs  r3,    [ip],#1      //  decrement, and the loop repeats only while
 196     bhi     L_ascendingShort    //  r2 is still nonzero after it.
 197 L_exit:
 198     pop     {r7,pc}             //  Restore r7 and return.
 199
 200 /*****************************************************************************
 201  *  Descending copy                                                          *
 202  *****************************************************************************/
 203
 204 L_descendingCopy:
 205     add     r1,         r2      //  Advance source and destination pointers to
 206     add     ip,         r2      //  the end of the buffer.
 207
 208     subs    r3,     r2, #32     //  r3 = length - 32.  If length < 32, jump to
 209     blo     L_descendingShort   //  the byte-at-a-time path for short buffers.
 210 //  (r0 is the start of the destination; r1 is now the end of the source.)
 211     orr     lr,     r0, r1      //  If the length is not a multiple of 16, or
 212     orr     lr,         r2      //  either buffer is not 16-byte aligned, then
 213     ands    lr,         #0xf    //  some edging is needed; jump to a separate
 214     bne     L_descendingEdging  //  branch to handle it.
 215
 216 /*****************************************************************************
 217  *  Descending vector aligned copy                                           *
 218  *****************************************************************************/
 219 //  Loop invariant: r3 = (bytes not yet copied) - 32 at the top of each pass.
 220 0:  sub     r1,         #32     //  Copies 32 bytes (16-byte aligned) from
 221     vld1.8  {q0,q1}, [r1,:128]  //  source to destination on each pass through
 222     sub     ip,         #32     //  the loop.  The loop ends when 32 or fewer
 223     vst1.8  {q0,q1}, [ip,:128]  //  bytes remain to be copied.
 224     subs    r3,         #32
 225     bhi     0b
 226     add     r3,         #32     //  r3 = bytes remaining.  Copy the remaining
 227     sub     r1,         r3      //  up-to-32 bytes.  This copy may overlap the
 228     vld1.8  {q0,q1}, [r1,:128]  //  copy performed in the final iteration of
 229     sub     ip,         r3      //  the previous loop, but this is more
 230     vst1.8  {q0,q1}, [ip,:128]  //  efficient than figuring out exactly which
 231     pop     {r7,pc}             //  bytes need to be copied.
 232
 233 /*****************************************************************************
 234  *  Descending vector misaligned copy                                        *
 235  *****************************************************************************/
 236 //  Only reached with length >= 32; r1 and ip point just past the bytes to copy.
 237 L_descendingEdging:
 238     tst     ip,         #0xf    //  Handled the same way as misalignment for
 239     itttt   ne                  //  ascending copies.  First we move one byte
 240     ldrbne  r3,    [r1,#-1]!    //  at a time until the destination has 16
 241     strbne  r3,    [ip,#-1]!    //  byte alignment.  The sub does not set
 242     subne   r2,         #1      //  flags, so the bne still acts on the tst.
 243     bne     L_descendingEdging
 244
 245     and     lr,     r1, #0xf    //  Then we extract the alignment of the source
 246     bic     r1,         #0xf    //  pointer into lr, round r1 down to a 16-byte
 247     subs    r3,     r2, #32     //  boundary, and (if at least 32 bytes remain)
 248     blo     L_descendingEdgingExit  //  use a jump table to dispatch into code
 249     tbh    [pc, lr, lsl #1]     //  that does the software alignment fixup.
 250 0:
 251 .short (L_descendingExtract0x0-0b)/2    //  Entry lr holds half the byte
 252 .short (L_descendingExtract0x1-0b)/2    //  distance from label 0 to its handler.
 253 .short (L_descendingExtract0x2-0b)/2
 254 .short (L_descendingExtract0x3-0b)/2
 255 .short (L_descendingExtract0x4-0b)/2
 256 .short (L_descendingExtract0x5-0b)/2
 257 .short (L_descendingExtract0x6-0b)/2
 258 .short (L_descendingExtract0x7-0b)/2
 259 .short (L_descendingExtract0x8-0b)/2
 260 .short (L_descendingExtract0x9-0b)/2
 261 .short (L_descendingExtract0xa-0b)/2
 262 .short (L_descendingExtract0xb-0b)/2
 263 .short (L_descendingExtract0xc-0b)/2
 264 .short (L_descendingExtract0xd-0b)/2
 265 .short (L_descendingExtract0xe-0b)/2
 266 .short (L_descendingExtract0xf-0b)/2
 267
 268 L_descendingExtract0x0:         //  If the two buffers are similarly aligned,
 269     sub     r1,         #32     //  we have a fast path much like the aligned
 270     vld1.8  {q0,q1}, [r1,:128]  //  copy loop.  Unlike that loop, it repeats
 271     sub     ip,         #32     //  only while at least 32 bytes remain, and
 272     vst1.8  {q0,q1}, [ip,:128]  //  leaves any tail to the byte loop with
 273     subs    r3,         #32     //  r3 = (bytes remaining) - 32.
 274     bhs     L_descendingExtract0x0
 275     b       L_descendingEdgingExit
 276 //  Macro-body immediates are written with '$', since '#' is a cpp operator.
 277 #define DESCENDING_EXTRACT(shift)\
 278 L_descendingExtract ## shift:\
 279     vld1.8  {q10},    [r1,:128];\
 280 0:  sub     r1,         $32;\
 281     vld1.8  {q8,q9},  [r1,:128];\
 282     vext.8  q1, q9, q10,$(shift);\
 283     vext.8  q0, q8, q9, $(shift);\
 284     vmov    q10,        q8;\
 285     sub     ip,         $32;\
 286     vst1.8  {q0,q1}, [ip,:128];\
 287     subs    r3,         $32;\
 288     bhs     0b;\
 289     b       L_descendingEdgingExit
 290
 291 DESCENDING_EXTRACT(0x1)         //  Otherwise, we use the loop above (almost
 292 DESCENDING_EXTRACT(0x2)         //  identical to the one we use in the
 293 DESCENDING_EXTRACT(0x3)         //  ascending copy case).  q10 carries the 16
 294 DESCENDING_EXTRACT(0x4)         //  source bytes just above the 32 loaded on
 295 DESCENDING_EXTRACT(0x5)         //  each pass; VEXT picks out the 32 bytes
 296 DESCENDING_EXTRACT(0x6)         //  that end shift bytes into q10's block, and
 297 DESCENDING_EXTRACT(0x7)         //  they are stored to the 16-byte aligned
 298 DESCENDING_EXTRACT(0x8)         //  destination.  The loop runs until fewer
 299 DESCENDING_EXTRACT(0x9)         //  than 32 bytes remain.  No pointer fixup
 300 DESCENDING_EXTRACT(0xa)         //  is needed on exit: r1 is left at the base
 301 DESCENDING_EXTRACT(0xb)         //  of the block held in q10, and the code at
 302 DESCENDING_EXTRACT(0xc)         //  L_descendingEdgingExit adds the offset in
 303 DESCENDING_EXTRACT(0xd)         //  lr back onto it.  Each expansion
 304 DESCENDING_EXTRACT(0xe)         //  overwrites q0, q1, q8, q9 and q10.
 305 DESCENDING_EXTRACT(0xf)
 306
 307 L_descendingEdgingExit:
 308     add     r1,         lr      //  Restore source pointer (lr = its offset
 309     add     r2,     r3, #32     //  within the block) and length (r3 + 32).
 310 L_descendingShort:
 311     subs    r2,         #1      //  Byte-by-byte copy loop for short overlapping
 312     itt     hs                  //  buffers.  r1 and ip point just past the
 313     ldrbhs  r3,    [r1,#-1]!    //  next byte to move, so each is decremented
 314     strbhs  r3,    [ip,#-1]!    //  before its access.  Loop while r2 is
 315     bhi     L_descendingShort   //  still nonzero after the decrement.
 316     pop     {r7,pc}             //  Restore r7 and return.
 317
 318 #endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD