osfmk/arm64/bcopy.s

   1 /*
   2  * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  *
  28  *  This file implements the following functions for the arm64 architecture.
  29  *
  30  *  void bcopy(const void * source,
  31  *             void * destination,
  32  *             size_t length);
  33  *
  34  *  void *memmove(void * destination,
  35  *                const void * source,
  36  *                size_t n);
  37  *
  38  *  void *memcpy(void * restrict destination,
  39  *               const void * restrict source,
  40  *               size_t n);
  41  *
  42  * All copy n successive bytes from source to destination.  Memmove and memcpy
  43  * return destination, whereas bcopy has no return value.  Copying takes place
  44  * as if it were through a temporary buffer -- after return destination
  45  * contains exactly the bytes from source, even if the buffers overlap (this is
  46  * not required of memcpy by the C standard; its behavior is undefined if the
  47  * buffers overlap, but we are holding ourselves to the historical behavior of
  48  * this function on MacOS).
  49  */
  50
  51 #include "asm.h"
  52
   53 .globl _bcopy                   // void bcopy(const void *source, void *destination, size_t length)
   54 .globl _ovbcopy                 // second name for the _bcopy entry point (both labels mark the same address)
   55 .globl _memcpy                  // void *memcpy(void *destination, const void *source, size_t n)
   56 .globl _memmove                 // second name for the _memcpy entry point (both labels mark the same address)
   57
   58 /*****************************************************************************
   59  *  Macros: copies shorter than kSmallCopy bytes use the small-copy loops    *
   60  *****************************************************************************/
   61
   62 #define kSmallCopy 64
  63
  64 /*****************************************************************************
  65  *  Entrypoints                                                              *
  66  *****************************************************************************/
  67
  68 .text
  69 .align 5
  70 _bcopy:
  71 _ovbcopy:
  72 //  Translate bcopy into memcpy by swapping the first and second arguments.
  73         mov     x3,      x0
  74         mov     x0,      x1
  75         mov     x1,      x3
  76
   77 .align 4                        // the bcopy/ovbcopy stub above has no branch or return: it falls through to here
   78 _memcpy:
   79 _memmove:
   80 //      Our preference is to copy the data in ascending address order, but if the
   81 //      buffers overlap such that the beginning of the destination buffer aliases
   82 //      the end of the source buffer, we need to copy in descending address order
   83 //      instead to preserve the memmove semantics.  We detect this case with the
   84 //      test:
   85 //
   86 //          destination - source < length    (unsigned compare)
   87 //
   88 //      If the address of the source buffer is higher than the address of the
   89 //      destination buffer, this arithmetic can overflow, but the overflowed value
   90 //      can only be smaller than length if the buffers do not overlap, so we don't
   91 //      need to worry about false positives due to the overflow (they happen, but
   92 //      only in cases where copying in either order is correct).
   93         ARM64_STACK_PROLOG       // macro not defined in this file; every exit path below ends with ARM64_STACK_EPILOG
   94         PUSH_FRAME               // macro not defined in this file; every exit path below runs POP_FRAME first
   95         sub     x3,      x0, x1  // x3 = destination - source (L_reverse also tests this value for zero)
   96         cmp     x3,      x2
   97         b.cc    L_reverse        // unsigned (destination - source) < length: copy in descending order
   98         mov     x3,      x0      // copy destination pointer; no instruction below writes x0, so it still holds destination at exit
   99         cmp     x2,      #(kSmallCopy)
  100         b.cc    L_forwardSmallCopy // length < kSmallCopy; otherwise fall through to the forward large copy
 101
  102 /*****************************************************************************
  103  *  Forward large copy                                                       *
  104  *****************************************************************************/
  105
  106 //      Load the first 32 bytes from src, and compute the number of bytes to the
  107 //      first 32-byte aligned location in dst.  Even though we are going to copy
  108 //      32 bytes, only those preceding that 32-byte location "count" towards
  109 //      reducing the length of the buffer or advancing the pointers.  We will need
  110 //      to issue the first load from the advanced src pointer BEFORE the store to
  111 //      the unmodified dst pointer.
  112         add     x3,      x3, #32  // x3 == x0 (dst) on entry to this path
  113         and     x3,      x3, #-32 // aligned dst: the first 32-byte boundary strictly above dst
  114         ldp     x12,x13,[x1]      // x12:x13 = src bytes 0-15
  115         ldp     x14,x15,[x1, #16] // x14:x15 = src bytes 16-31
  116         sub     x5,      x3, x0   // bytes between original dst and aligned dst (1 to 32)
  117         add     x1,      x1, x5   // update src pointer
  118
  119 //      At this point, data in the following registers is in flight:
  120 //
  121 //              x0    original dst pointer
  122 //              x1    location in src buffer corresponding to x3.
  123 //              x2    original length.  Once x5 is subtracted from it below, it is the length from the aligned
  124 //                    location in dst to end of buffer, which is guaranteed to be >= (64 - 32).
  125 //              x3    aligned location in dst buffer.
  126 //              x12:x15 first 32 bytes of src buffer.
  127 //
  128 //      We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0.  The
  129 //      store *may* overlap the first 32 bytes of the load, so in order to get
  130 //      correct memmove semantics, the first 32 byte load must occur before the
  131 //      store.
  132 //
  133 //      After loading these 32 bytes, we advance x1, and decrement the length by
  134 //      64.  If the remaining length of the buffer was 64 or less, then we jump
  135 //      directly to the cleanup path.
  136         ldp     x8, x9, [x1]      // 32 src bytes destined for [x3, x3+32)
  137         ldp     x10,x11,[x1, #16]
  138         add     x1,      x1, #32
  139         sub     x2,      x2, x5   // update length
  140         stp     x12,x13,[x0]      // initial unaligned store
  141         stp     x14,x15,[x0, #16] // initial unaligned store
  142         subs    x2,      x2, #64  // x2 = (bytes from x3 to end of buffer) - 64
  143         b.ls    L_forwardCleanup  // taken when the pre-subtraction value was <= 64 (unsigned)
 144
  145 L_forwardCopyLoop:
  146 //      Main copy loop (ascending addresses, 32 bytes per iteration):
  147 //
  148 //              1. store the 32 bytes loaded in the previous loop iteration
  149 //              2. advance the destination pointer
  150 //              3. load the next 32 bytes
  151 //              4. advance the source pointer
  152 //              5. subtract 32 from the length
  153 //
  154 //      The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
  155 //      trailing 1-32 bytes will be copied in the loop cleanup.
  156         stnp    x8, x9, [x3]      // 1. store pair with non-temporal hint; x3 is 32-byte aligned
  157         stnp    x10,x11,[x3, #16]
  158         add     x3,      x3, #32  // 2.
  159         ldnp    x8, x9, [x1]      // 3. load pair with non-temporal hint
  160         ldnp    x10,x11,[x1, #16]
  161         add     x1,      x1, #32  // 4.
  162         subs    x2,      x2, #32  // 5.
  163         b.hi    L_forwardCopyLoop // loop while the pre-subtraction x2 was > 32 (unsigned)
 164
  165 L_forwardCleanup:
  166 //      There are 32 bytes in x8-x11 that were loaded in the previous loop iteration (or by
  167 //      the setup code, if the loop was skipped), which need to be stored to [x3,x3+32).  In addition, between
  168 //      0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
  169 //      number of bytes to copy is x2 + 32.  Instead of using smaller conditional
  170 //      copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2 (these
  171 //      end at 64+x3+x2, the end of the buffer).  This copy may overlap with the
  172 //      first store, so the loads must come before the store of the data from the previous loop iteration.
  173         add     x1,      x1, x2   // x2 is in [-32, 0] here, so this steps src back by 0-32 bytes
  174         ldp     x12,x13,[x1]      // last 32 bytes of src
  175         ldp     x14,x15,[x1, #16]
  176         stp     x8, x9, [x3]      // pending 32 bytes from the most recent load
  177         stp     x10,x11,[x3, #16]
  178         add     x3,      x3, x2
  179         stp     x12,x13,[x3, #32] // last 32 bytes of dst
  180         stp     x14,x15,[x3, #48]
  181         POP_FRAME                 // macro not defined in this file; counterpart of PUSH_FRAME at the entry point
  182         ARM64_STACK_EPILOG        // macro not defined in this file; last statement of this path
 183
 184 /*****************************************************************************
 185  *  forward small copy                                                       *
 186  *****************************************************************************/
 187
 188 //      Copy one quadword at a time until less than 8 bytes remain to be copied.
 189 //      At the point of entry to L_forwardSmallCopy, the "calling convention"
 190 //      is as follows:
 191 //
 192 //        x0     pointer to first byte of destination
 193 //        x1     pointer to first byte of source
 194 //        x2     length of buffers
 195 //        x3     pointer to first byte of destination
 196 0:      ldr     x6,     [x1],#8
 197         str     x6,     [x3],#8
 198 L_forwardSmallCopy:
 199         subs    x2,      x2, #8
 200         b.cs    0b
 201         adds    x2,      x2, #8
 202         b.eq    2f
 203 1:      ldrb    w6,     [x1],#1
 204         strb    w6,     [x3],#1
 205         subs    x2,      x2, #1
 206         b.ne    1b
 207 2:      POP_FRAME
 208         ARM64_STACK_EPILOG
 209
 210 /*****************************************************************************
 211  *  Reverse copy engines                                                     *
 212  *****************************************************************************/
 213
 214 //      The reverse copy engines are identical in every way to the forward copy
 215 //      engines, except in that they do everything backwards.  For this reason, they
 216 //      are somewhat more sparsely commented than the forward copy loops.  I have
 217 //      tried to only comment things that might be somewhat surprising in how they
 218 //      differ from the forward implementation.
 219 //
  220 //      The one important thing to note is that (almost without fail), x1 and x3
  221 //      (x4 in the small copy) will point to ONE BYTE BEYOND the "right-hand edge"
  222 //      of the active buffer throughout these copy loops.  x1 and x4 are initially
  223 //      advanced to that position in the L_reverse jump island.  Because of this,
  224 //      whereas the forward copy loops generally follow a "copy data, then advance
  225 //      pointers" scheme, in the reverse copy loops, we advance the pointers, then copy the data.
 226
  227 L_reverse:
  228 //      Descending-order copy entry (dst - src < length).  As a minor optimization, we early out if dst == src.
  229         cbz     x3,      L_return       // x3 still holds (dst - src) computed at the memcpy entry
  230 //      advance both pointers to the ends of their respective buffers before
  231 //      jumping into the appropriate reverse copy loop.
  232         add     x4,      x0, x2         // x4 = one past the last dst byte; x0 stays at the start of dst
  233         add     x1,      x1, x2         // x1 = one past the last src byte
  234         cmp     x2,      #(kSmallCopy)
  235         b.cc    L_reverseSmallCopy      // length < kSmallCopy; otherwise fall through to the reverse large copy
 236
  237 /*****************************************************************************
  238  *  Reverse large copy                                                       *
  239  *****************************************************************************/
  240 //      On entry x1/x4 are one past the end of src/dst and x2 (length) is >= kSmallCopy.
  241         ldp     x12,x13,[x1, #-16]  // last 16 bytes of src
  242         ldp     x14,x15,[x1, #-32]  // the 16 bytes before those
  243         sub     x3,      x4, #1   // In the forward copy, we used dst+32 & -32
  244         and     x3,      x3, #-32 // to find an aligned location in the dest
  245         sub     x5,      x4, x3   // buffer.  Here we use dst-1 & -32 instead,
  246         sub     x1,      x1, x5   // because we are going backwards.
  247         sub     x2,      x2, x5   // the x5 (1 to 32) bytes above x3 are covered by the unaligned store below
  248         ldp     x8, x9, [x1, #-16]  // 32 src bytes destined for [x3-32, x3); loaded before the stores below
  249         ldp     x10,x11,[x1, #-32]
  250         stp     x12,x13,[x4, #-16]  // unaligned store of the last 32 bytes of dst
  251         stp     x14,x15,[x4, #-32]
  252         sub     x1,      x1, #32
  253         subs    x2,      x2, #64
  254         b.ls    L_reverseCleanup    // taken when the pre-subtraction value was <= 64 (unsigned)
 255
  256 L_reverseCopyLoop:                  // descending counterpart of L_forwardCopyLoop: 32 bytes per iteration
  257         stnp    x8, x9, [x3, #-16]  // store the 32 bytes loaded previously to [x3-32, x3)
  258         stnp    x10,x11,[x3, #-32]
  259         sub     x3,      x3, #32    // move the destination pointer down
  260         ldnp    x8, x9, [x1, #-16]  // load the next (lower) 32 bytes of src
  261         ldnp    x10,x11,[x1, #-32]
  262         sub     x1,      x1, #32    // move the source pointer down
  263         subs    x2,      x2, #32
  264         b.hi    L_reverseCopyLoop   // loop while the pre-subtraction x2 was > 32 (unsigned)
 265
  266 L_reverseCleanup:                   // x8-x11 hold 32 bytes for [x3-32, x3); x2 is in [-32, 0]
  267         sub     x1,      x1, x2   // x2 <= 0, so this moves x1 up to (start of src) + 32
  268         ldp     x12,x13,[x1, #-16]  // src bytes 16-31
  269         ldp     x14,x15,[x1, #-32]  // src bytes 0-15
  270         stp     x8, x9, [x3, #-16]  // pending data; stored only after the loads above, which it may overlap
  271         stp     x10,x11,[x3, #-32]
  272         stp     x12,x13,[x0, #16] // In the forward copy, we need to compute the
  273         stp     x14,x15,[x0]      // address of these stores, but here we already
  274         POP_FRAME       // have a pointer to the start of the buffer.
  275         ARM64_STACK_EPILOG
 276
 277 /*****************************************************************************
 278  *  reverse small copy                                                       *
 279  *****************************************************************************/
 280
 281 0:      ldr     x6,     [x1,#-8]!
 282         str     x6,     [x4,#-8]!
 283 L_reverseSmallCopy:
 284         subs    x2,      x2, #8
 285         b.cs    0b
 286         adds    x2,      x2, #8
 287         b.eq    2f
 288 1:      ldrb    w6,     [x1,#-1]!
 289         strb    w6,     [x4,#-1]!
 290         subs    x2,      x2, #1
 291         b.ne    1b
 292 2:      POP_FRAME
 293         ARM64_STACK_EPILOG
 294
 295
  296 L_return:                           // early-out target used by L_reverse when dst == src: nothing is copied
  297         POP_FRAME                   // macro not defined in this file; counterpart of PUSH_FRAME at the entry point
  298         ARM64_STACK_EPILOG          // macro not defined in this file; counterpart of ARM64_STACK_PROLOG