/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 * void memset_pattern4(void *b, const void *pattern4, size_t len);
 * void memset_pattern8(void *b, const void *pattern8, size_t len);
 * void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm;
 * refer to it if you have any questions about the other two.
 */
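
/*
 * Added for reference: a hedged C-level sketch of the behavior these routines
 * provide. The name memset_pattern4_ref is hypothetical, and none of the
 * alignment or NEON tricks used below appear here.
 *
 *     static void memset_pattern4_ref(void *b, const void *pattern4, size_t len) {
 *         unsigned char *dst = (unsigned char *)b;
 *         const unsigned char *pat = (const unsigned char *)pattern4;
 *         for (size_t i = 0; i < len; ++i)
 *             dst[i] = pat[i % 4];    // pattern repeats every 4 bytes; the tail may be a partial copy
 *     }
 *
 * memset_pattern8 and memset_pattern16 behave the same way with i % 8 and
 * i % 16, respectively.
 */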

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.syntax unified
.code 32
.text
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

.align 4
_memset_pattern4$VARIANT$Swift:
    push    {r7,lr}
    mov     r7, sp

// Load the pattern and splat it to q0, then check if the buffer is at least
// 64 bytes long. If not, branch to a short-buffer implementation.
    ldr     r1, [r1]
    vdup.32 q0, r1
    subs    r3, r2, #64
    blo     L_short4

// We want to use aligned vector stores to fill the bulk of the buffer. In
// order to make that work, we need to rotate the pattern as necessary to
// match up with aligned locations, and we also need to extract the alignment
// of the destination pointer mod 16.
    lsl     ip, r0, #3
    and     lr, r0, #0xf        // alignment of destination pointer mod 16
    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).
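
// Illustration (hedged, added sketch; addr, pat, and rot are hypothetical names,
// not the registers used here): the rotation amount that matters is just
// (32 - 8*(address%4)) mod 32, since ROR by register only uses the low bits of
// ip. For example, address%4 == 1 gives a rotate right by 24 bits, turning the
// pattern [ 0 1 2 3 ] (in memory order) into [ 3 0 1 2 ], which is what belongs
// at the next 4-byte-aligned address. In C-style terms:
//
//      uint32_t rot = (32 - 8 * (addr & 3)) & 31;                      // 0, 24, 16, or 8
//      uint32_t aligned_pat = rot ? (pat >> rot) | (pat << (32 - rot)) // rotate right
//                                 : pat;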

// Before we start the aligned stores, we do a single unaligned store of
// 16 bytes of the pattern to the start of the buffer. Since the buffer is
// at least 64 bytes long, this store is known to lie entirely inside the
// buffer:
//                                                           first aligned address in buffer
//                                                           v
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//       ^
//       unaligned store starts here:
//     [ 0   1   2   3   0   1   2   3   0   1   2   3   0   1   2   3 ]
    vst1.8  {q0}, [r0]!

// Subsequent stores will be aligned, and will start at the first aligned
// address in the buffer. We apply the rotation that we calculated before
// the vector store (in the low five bits of ip) to get the pattern that
// is to be stored starting at the aligned location. For example, in the
// picture above, the buffer had alignment of 3 mod 4, so the rotation to
// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
// needs to be stored starting at the first aligned location.
//
// Besides rotating the pattern, we also need to adjust the length (by
// subtracting 16 - alignment mod 16), and to advance the pointer to the
// first aligned location.
    ror     r1, ip              // Pattern to use for aligned memory
    add     r3, lr
    bic     r0, #0xf            // destination for first aligned store
    subs    r3, #16             // updated length
    blo     1f
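
// Worked example (added for illustration): suppose r0%16 == 3 and len == 100
// on entry. Then r3 starts as 100-64 == 36, "add r3, lr" gives 39, and
// "subs r3, #16" gives 23. The bytes remaining past the first aligned address
// are 100 - (16-3) == 87, so r3 now holds that count minus 64, which is the
// invariant the store loop below relies on.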

// Splat the rotated value across q1 and q2
    vdup.32 q1, r1
    vmov    q2, q1

// Main store loop. We write the splatted aligned pattern across 64 bytes
// per iteration, terminating the loop when the remaining length of the
// buffer is 64 bytes or less.
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
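
// In rough C terms (a hedged sketch with hypothetical names: r3 plays the role
// of remaining_minus_64, a signed count, and r0 the role of dst, which is
// 16-byte aligned):
//
//      do {
//          remaining_minus_64 -= 64;
//          store_64_bytes_of_rotated_pattern(dst);   // hypothetical helper for the two vst1.32
//          dst += 64;
//      } while (remaining_minus_64 > 0);             // i.e. while more than 64 bytes remain past dst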

// The remaining length of the buffer is 64 bytes or less (but the total
// length of the buffer is at least 64 bytes; otherwise we would have
// branched to the "short" path). Thus, we can handle the entirety of the
// remaining buffer with two 32-byte unaligned stores.
//
// Again, we need to rotate the pattern to match the alignment, this time
// by 8*(length%4), and we also need to back up the destination pointer
// so that it points to precisely 64 bytes before the end of the buffer.
// We accomplish this by adding r3, which contains the remaining length of
// the buffer minus 64.
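
// Worked example (added for illustration): if 23 bytes remain past the current
// aligned destination, r3 holds 23-64 == -41. The lsl/ror below then rotate
// the pattern right by (8*(-41)) mod 32 == 24 == 8*(23%4) bits, and
// "add r0, r3" backs the pointer up by 41 bytes so that it points exactly
// 64 bytes before the end of the buffer.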
1:  lsl     ip, r3, #3
    ror     r1, ip
    vdup.32 q8, r1
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    pop     {r7,pc}

L_short4:
// If we branch here, the buffer is less than 64 bytes long. At this point,
// register contents are as follows:
//
//      r0  pointer to the buffer
//      r1  pattern
//      r2  buffer length
//      q0  splatted pattern
//
// To begin, we store eight bytes at a time until the remaining length is
// less than eight bytes.
    subs    r3, r2, #8
    blo     1f
0:  subs    r3, #8
    vst1.32 {d0}, [r0]!
    bhs     0b

// Then we store one byte at a time, rotating the pattern to get the next
// byte, until we reach the end of the buffer.
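
// In rough C terms (a hedged sketch; pat is the 32-bit pattern held in r1 and
// remaining is the byte count held in r2, both hypothetical names):
//
//      while (remaining > 0) {
//          *dst++ = (unsigned char)pat;        // store the low byte of the pattern
//          pat = (pat >> 8) | (pat << 24);     // rotate right by 8 to expose the next byte
//          remaining--;
//      }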
    add     r2, r3, #8
1:  subs    r2, #1
    strbhs  r1, [r0],#1
    ror     r1, #8
    bhi     1b
    pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern8$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {d0}, [r1]
    vmov    d1, d0
    subs    r3, r2, #64
    blo     L_short8

    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
    rsb     ip, ip, #8          // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
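
// Worked example (added for illustration): the 32 bytes at sp now hold the
// 8-byte pattern repeated four times, so the byte at sp+k is pattern byte k%8.
// If the destination has r0%8 == 3, then ip == sp+5, and the 16 bytes loaded
// from ip begin with pattern byte 5 -- exactly the byte that the unaligned
// head store below leaves at the first 16-byte-aligned address.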
    and     lr, r0, #0xf
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short8:
    subs    r2, #8
    blo     1f
0:  subs    r2, #8
    vst1.32 {d0}, [r0]!
    bhs     0b
1:  adds    r2, #8
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern16$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {q0}, [r1]
    subs    r3, r2, #64
    blo     L_short16

    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    sub     sp, #16
    vst1.8  {q0}, [sp,:128]
    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
    rsb     ip, lr, #16         // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
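
// Worked example (added for illustration): the 48 bytes at sp now hold the
// 16-byte pattern repeated three times, so the byte at sp+k is pattern byte
// k%16. If the destination has r0%16 == 3, then ip == sp+13, and the 16 bytes
// loaded from ip begin with pattern byte 13 -- exactly the byte that the
// unaligned head store below leaves at the first 16-byte-aligned address.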
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short16:
    subs    r2, #16
    blo     1f
0:  subs    r2, #16
    vst1.32 {q0}, [r0]!
    bhs     0b
1:  adds    r2, #16
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD