osfmk/arm64/bcopy.s

   1 /*
   2  * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  *
  28  *  This file implements the following functions for the arm64 architecture.
  29  *
  30  *  void bcopy(const void * source,
  31  *             void * destination,
  32  *             size_t length);
  33  *
  34  *  void *memmove(void * destination,
  35  *                const void * source,
  36  *                size_t n);
  37  *
  38  *  void *memcpy(void * restrict destination,
  39  *               const void * restrict source,
  40  *               size_t n);
  41  *
  42  * All copy n successive bytes from source to destination.  Memmove and memcpy
  43  * return destination, whereas bcopy has no return value.  Copying takes place
  44  * as if it were through a temporary buffer -- after return destination
  45  * contains exactly the bytes from source, even if the buffers overlap (this is
  46  * not required of memcpy by the C standard; its behavior is undefined if the
  47  * buffers overlap, but we are holding ourselves to the historical behavior of
  48  * this function on MacOS).
  49  */
  50
  51 #include "asm.h"
  52
  53 .globl _bcopy                     // bcopy(source, destination, length)
  54 .globl _ovbcopy                   // second name for the same address as _bcopy
  55 .globl _memcpy                    // memcpy(destination, source, n)
  56 .globl _memmove                   // second name for the same address as _memcpy
  57
  58 /*****************************************************************************
  59  *  Macros                                                                   *
  60  *****************************************************************************/
  61
  62 #define kSmallCopy 64 /* lengths below this (unsigned) take the small-copy loops instead of the 32-byte block engines */
  63
  64 /*****************************************************************************
  65  *  Entrypoints                                                              *
  66  *****************************************************************************/
  67
  68 .text
  69 .align 5                         // 2^5 = 32-byte alignment for the entry point
  70 _bcopy:
  71 _ovbcopy:
  72 //  Translate bcopy(source, destination, length) into memcpy(destination, source, length) by swapping x0 and x1; x3 is scratch.
  73         mov     x3,      x0      // x3 = source (temporary)
  74         mov     x0,      x1      // x0 = destination
  75         mov     x1,      x3      // x1 = source; no branch follows, so execution continues into _memcpy
  76
  77 .align 4
  78 _memcpy:
  79 _memmove:
  80 //      On entry: x0 = destination, x1 = source, x2 = length in bytes.  No path
  81 //      below writes x0, so it still holds destination when we return.
  82 //
  83 //      We prefer to copy in ascending address order, but if the buffers overlap
  84 //      such that the beginning of the destination aliases the end of the source,
  85 //      we must copy in descending order to preserve memmove semantics.  We
  86 //      detect this case with the test:
  87 //
  88 //          destination - source < length    (unsigned compare)
  89 //
  90 //      If source is above destination the subtraction wraps, but the wrapped
  91 //      value can only be below length when the buffers do not overlap, where
  92 //      either copy order is correct, so those false positives are harmless.
  93         PUSH_FRAME                // macro not defined in this file (presumably from asm.h); every exit path issues POP_FRAME
  94         sub     x3,      x0, x1   // x3 = destination - source
  95         cmp     x3,      x2
  96         b.cc    L_reverse         // unsigned x3 < length: copy in descending order
  97         mov     x3,      x0      // copy destination pointer
  98         cmp     x2,      #(kSmallCopy)
  99         b.cc    L_forwardSmallCopy // length < kSmallCopy; otherwise fall through to the large copy
 100
 101 /*****************************************************************************
 102  *  Forward large copy                                                       *
 103  *****************************************************************************/
 104
 105 //      Load the first 32 bytes from src, and compute the number of bytes to the
 106 //      first 32-byte aligned location in dst.  Even though we are going to copy
 107 //      32 bytes, only those preceding that 32-byte location "count" towards
 108 //      reducing the length of the buffer or advancing the pointers.  We will need
 109 //      to issue the first load from the advanced src pointer BEFORE the store to
 110 //      the unmodified dst pointer.
 111         add     x3,      x3, #32  // x3 = dst + 32, rounded down on the next line
 112         and     x3,      x3, #-32 // aligned dst, with dst < x3 <= dst + 32
 113         ldp     x12,x13,[x1]      // x12:x15 = first 32 bytes of src
 114         ldp     x14,x15,[x1, #16]
 115         sub     x5,      x3, x0   // bytes between original dst and aligned dst (1 to 32)
 116         add     x1,      x1, x5   // update src pointer so that it corresponds to x3
 117
 118 //      At this point, data in the following registers is in flight:
 119 //
 120 //              x0    original dst pointer
 121 //              x1    location in the src buffer corresponding to x3.
 122 //              x2    original length; once x5 is subtracted below it is the length
 123 //                    from x3 to the end of dst, guaranteed to be >= (64 - 32).
 124 //              x3    aligned location in dst buffer.
 125 //              x5    x3 - x0, between 1 and 32.
 126 //              x12:x15 first 32 bytes of src buffer.
 127 //
 128 //      We now load 32 bytes from x1, then store the 32 bytes in x12:x15 to x0.
 129 //      That store *may* overlap the bytes being loaded, so in order to get correct
 130 //      memmove semantics, the loads must occur before the store.
 131 //
 132 //      After loading these 32 bytes, we advance x1, and decrement the length by
 133 //      64.  If the remaining length of the buffer was 64 or less, then we jump
 134 //      directly to the cleanup path.
 135         ldp     x8, x9, [x1]      // x8:x11 = the 32 src bytes destined for [x3, x3+32)
 136         ldp     x10,x11,[x1, #16]
 137         add     x1,      x1, #32
 138         sub     x2,      x2, x5   // update length
 139         stp     x12,x13,[x0]      // initial unaligned store
 140         stp     x14,x15,[x0, #16] // initial unaligned store
 141         subs    x2,      x2, #64  // from here on, x2 + 32 = src bytes not yet loaded
 142         b.ls    L_forwardCleanup  // x2 was <= 64 (unsigned) before the subtract: skip the loop
 143
 144 L_forwardCopyLoop:
 145 //      Main copy loop.  x8-x11 hold 32 bytes already loaded for [x3,x3+32), x1
 146 //      is the next source byte to load, and x2 + 32 bytes remain to be loaded.
 147 //              1. store the 32 bytes loaded in the previous loop iteration
 148 //              2. advance the destination pointer
 149 //              3. load the next 32 bytes
 150 //              4. advance the source pointer
 151 //              5. subtract 32 from the length
 152 //
 153 //      The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
 154 //      trailing 1-32 bytes will be copied in the loop cleanup.
 155         stnp    x8, x9, [x3]      // stnp/ldnp are the non-temporal-hint pair store/load forms
 156         stnp    x10,x11,[x3, #16]
 157         add     x3,      x3, #32
 158         ldnp    x8, x9, [x1]
 159         ldnp    x10,x11,[x1, #16]
 160         add     x1,      x1, #32
 161         subs    x2,      x2, #32
 162         b.hi    L_forwardCopyLoop // x2 was > 32 (unsigned) before the subtract: go again
 163
 164 L_forwardCleanup:
 165 //      There are 32 bytes in x8-x11 that were loaded in the previous loop
 166 //      iteration, which need to be stored to [x3,x3+32).  In addition, between
 167 //      0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
 168 //      number of bytes to copy is x2 + 32.  Instead of using smaller conditional
 169 //      copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2.
 170 //      This copy may overlap with the first store, so the loads must come before
 171 //      the store of the data from the previous loop iteration.
 172         add     x1,      x1, x2   // x2 is in [-32, 0]: x1 now points 32 bytes before the end of src
 173         ldp     x12,x13,[x1]      // x12:x15 = last 32 bytes of src
 174         ldp     x14,x15,[x1, #16]
 175         stp     x8, x9, [x3]      // pending aligned block, stored only after the loads above
 176         stp     x10,x11,[x3, #16]
 177         add     x3,      x3, x2
 178         stp     x12,x13,[x3, #32] // last 32 bytes of dst
 179         stp     x14,x15,[x3, #48]
 180         POP_FRAME
 181         ret                       // x0 was never written: it still holds the destination
 182
 183 /*****************************************************************************
 184  *  forward small copy                                                       *
 185  *****************************************************************************/
 186
 187 //      Copy 8 bytes (one doubleword) at a time until fewer than 8 bytes remain,
 188 //      then finish one byte at a time.  At the point of entry to
 189 //      L_forwardSmallCopy, the "calling convention" is as follows:
 190 //
 191 //        x0     pointer to first byte of destination (not modified here)
 192 //        x1     pointer to first byte of source
 193 //        x2     length of buffers
 194 //        x3     pointer to first byte of destination (advanced as bytes are stored)
 195 0:      ldr     x6,     [x1],#8   // post-indexed: load 8 bytes, then x1 += 8
 196         str     x6,     [x3],#8   // store 8 bytes, then x3 += 8
 197 L_forwardSmallCopy:
 198         subs    x2,      x2, #8
 199         b.cs    0b                // no borrow: at least 8 bytes were left, copy them
 200         adds    x2,      x2, #8   // undo the last subtract: x2 = remaining bytes (0 to 7)
 201         b.eq    2f                // nothing left
 202 1:      ldrb    w6,     [x1],#1   // byte-at-a-time tail
 203         strb    w6,     [x3],#1
 204         subs    x2,      x2, #1
 205         b.ne    1b
 206 2:      POP_FRAME
 207         ret
 208
 209 /*****************************************************************************
 210  *  Reverse copy engines                                                     *
 211  *****************************************************************************/
 212
 213 //      The reverse copy engines are identical in every way to the forward copy
 214 //      engines, except in that they do everything backwards.  For this reason, they
 215 //      are somewhat more sparsely commented than the forward copy loops.  I have
 216 //      tried to only comment things that might be somewhat surprising in how they
 217 //      differ from the forward implementation.
 218 //
 219 //      The one important thing to note is that (almost without fail), the source
 220 //      pointer x1 and the destination pointer (x4, or x3 once the large copy has
 221 //      aligned it) point to ONE BYTE BEYOND the "right-hand edge" of the active
 222 //      buffer.  x1 and x4 are advanced to that position in the L_reverse jump
 223 //      island.  So whereas the forward loops "copy data, then advance pointers",
 224 //      the reverse loops access memory below the pointers (negative offsets or pre-decrement).
 225
 226 L_reverse:
 227 //      As a minor optimization, we early out if dst == src.
 228         cbz     x3,      L_return // x3 still holds dst - src from the entry test
 229 //      advance both pointers to the ends of their respective buffers before
 230 //      jumping into the appropriate reverse copy loop.
 231         add     x4,      x0, x2   // x4 = dst + length; x0 itself is left untouched
 232         add     x1,      x1, x2   // x1 = src + length
 233         cmp     x2,      #(kSmallCopy)
 234         b.cc    L_reverseSmallCopy // length < kSmallCopy; otherwise fall through to the large copy
 235
 236 /*****************************************************************************
 237  *  Reverse large copy                                                       *
 238  *****************************************************************************/
 239
 240         ldp     x12,x13,[x1, #-16] // x12:x15 = last 32 bytes of src, loaded before any store
 241         ldp     x14,x15,[x1, #-32]
 242         sub     x3,      x4, #1   // In the forward copy, we used (dst+32) & -32
 243         and     x3,      x3, #-32 // to find an aligned location in the dest
 244         sub     x5,      x4, x3   // buffer.  Here we use (end of dst - 1) & -32 instead,
 245         sub     x1,      x1, x5   // because we are going backwards.  x5 is 1 to 32.
 246         sub     x2,      x2, x5   // length of dst below the aligned location x3
 247         ldp     x8, x9, [x1, #-16] // x8:x11 = the 32 src bytes destined for [x3-32, x3)
 248         ldp     x10,x11,[x1, #-32]
 249         stp     x12,x13,[x4, #-16] // initial unaligned store of the last 32 bytes of dst
 250         stp     x14,x15,[x4, #-32]
 251         sub     x1,      x1, #32
 252         subs    x2,      x2, #64  // from here on, x2 + 32 = src bytes not yet loaded
 253         b.ls    L_reverseCleanup  // x2 was <= 64 (unsigned) before the subtract: skip the loop
 254
 255 L_reverseCopyLoop:                // descending main loop; x8-x11 hold 32 loaded bytes destined for [x3-32, x3)
 256         stnp    x8, x9, [x3, #-16] // store the block loaded on the previous pass
 257         stnp    x10,x11,[x3, #-32]
 258         sub     x3,      x3, #32  // move the destination pointer down
 259         ldnp    x8, x9, [x1, #-16] // load the next 32 bytes below x1
 260         ldnp    x10,x11,[x1, #-32]
 261         sub     x1,      x1, #32  // move the source pointer down
 262         subs    x2,      x2, #32
 263         b.hi    L_reverseCopyLoop // x2 was > 32 (unsigned) before the subtract: go again
 264
 265 L_reverseCleanup:                 // x8-x11 are still unstored; x2 + 32 (0 to 32) src bytes are not yet loaded
 266         sub     x1,      x1, x2   // x2 <= 0 here, so this moves x1 up to src + 32
 267         ldp     x12,x13,[x1, #-16] // x12:x15 = first 32 bytes of src, loaded before the stores below
 268         ldp     x14,x15,[x1, #-32]
 269         stp     x8, x9, [x3, #-16] // pending aligned block
 270         stp     x10,x11,[x3, #-32]
 271         stp     x12,x13,[x0, #16] // In the forward copy, we need to compute the
 272         stp     x14,x15,[x0]      // address of these stores, but here we already
 273         POP_FRAME       // have a pointer to the start of the buffer.
 274         ret
 275
 276 /*****************************************************************************
 277  *  reverse small copy                                                       *
 278  *****************************************************************************/
 279
 280 0:      ldr     x6,     [x1,#-8]! // pre-indexed: x1 -= 8, then load 8 bytes
 281         str     x6,     [x4,#-8]! // x4 -= 8, then store 8 bytes
 282 L_reverseSmallCopy:               // entry: x1 and x4 point one past the end of src and dst, x2 = length
 283         subs    x2,      x2, #8
 284         b.cs    0b                // no borrow: at least 8 bytes were left, copy them
 285         adds    x2,      x2, #8   // undo the last subtract: x2 = remaining bytes (0 to 7)
 286         b.eq    2f                // nothing left
 287 1:      ldrb    w6,     [x1,#-1]! // byte-at-a-time tail, still descending
 288         strb    w6,     [x4,#-1]!
 289         subs    x2,      x2, #1
 290         b.ne    1b
 291 2:      POP_FRAME
 292         ret
 293
 294 L_return:                         // reached from L_reverse when dst == src: nothing to copy
 295         POP_FRAME
 296         ret                       // x0 still holds the destination pointer