x86_64/string/memset.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22
  23 #include <machine/cpu_capabilities.h>
  24
  25
  26 /* This file contains the following functions:
  27  *
  28  *      void *memset(void *b, int c, size_t len);
  29  *      void memset_pattern4(void *b, const void *c4, size_t len);
  30  *      void memset_pattern8(void *b, const void *c8, size_t len);
  31  *      void memset_pattern16(void *b, const void *c16, size_t len);
  32  *
  33  * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
   34  * others go to Lmemset_pattern, which is entered as follows:
   35  *      %rdi = ptr to memory to set (aligned by the caller when practical)
   36  *      %rdx = length (which can be short, though we bias in favor of long operands)
   37  *      %xmm0 = the pattern to store
   38  * Return conditions:
   39  *      %rax, %rdi, %rsi, %rcx, %rdx, and %xmm0 all trashed
  40  *
  41  * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
  42  * on P4s and probably other processors.
  43  */
  44
   45 #define kShort  255                     // nonzero memset() of <= kShort bytes is handled inline instead of calling Lmemset_pattern
   46 // _memset: as used below, %rdi = b, %esi = c (only the low byte is used), %rdx = len
   47
   48         .text
   49         .globl  _memset
   50         .align  2
   51 _memset:                                // void *memset(void *b, int c, size_t len);
   52         andl    $0xFF,%esi              // keep only the low byte of c; ZF is set iff that byte is 0
   53         jnz             LNonzero                // not a bzero
   54         // c == 0: hand off to _bzero(b, len).  NOTE(review): %rax is not set on this path, so the memset() return value depends on what _bzero (not visible here) leaves in %rax -- confirm
   55         movq    %rdx,%rsi               // put count where bzero() expects it
   56         jmp             _bzero                  // enter _bzero with a jump, so its ret returns directly to our caller
  57
  58
   59 // Handle memset of a nonzero value.
   60 //      %rdi = ptr, %esi = c (already masked to one byte), %rdx = length
   61 LNonzero:
   62         movq    %rdi,%r8                // preserve the original pointer so we can return it
   63         movl    %esi,%eax               // replicate byte in %esi into all four bytes
   64         shll    $8,%esi                 // %esi = c << 8
   65         orl             %esi,%eax               // %eax = c in each of the low two bytes
   66         movl    %eax,%esi               // copy the 2-byte pattern
   67         shll    $16,%esi                // and move the copy into the high two bytes
   68         orl             %esi,%eax               // now %eax has "c" in all 4 bytes
   69         cmpq    $(kShort),%rdx          // is operand too short for SSE?
   70         ja              LCallCommpage           // no (unsigned compare of the full 64-bit length)
   71
   72         // Nonzero memset() too short to call commpage.
   73         //      %eax = replicated 4-byte pattern
   74         //      %rdi = ptr
   75         //      %edx = length (<= kShort)
   76
   77         cmpl    $16,%edx                // long enough to word align?
   78         jge     3f                      // yes: align ptr, then store doublewords
   79         test    %edx,%edx               // length==0?
   80         jz      6f
   81 1:                                      // length is 1..15: simple byte loop
   82         movb    %al,(%rdi)              // pack in a byte
   83         addq    $1,%rdi
   84         subl    $1,%edx
   85         jnz     1b
   86         jmp     6f                      // done
   87 2:                                      // store one byte at a time until ptr is 4-byte aligned
   88         movb    %al,(%rdi)              // pack in a byte
   89         addq    $1,%rdi
   90         subl    $1,%edx
   91 3:
   92         test    $3,%edi                 // is ptr doubleword aligned?
   93         jnz     2b                      // no
   94         movl    %edx,%ecx               // copy length; its low 2 bits are the trailing byte count
   95         shrl    $2,%edx                 // #doublewords to store (nonzero: >= 16 on entry, at most 3 bytes used to align)
   96 4:
   97         movl    %eax,(%rdi)             // store aligned doubleword
   98         addq    $4,%rdi
   99         subl    $1,%edx
  100         jnz     4b
  101         andl    $3,%ecx                 // any leftover bytes?
  102         jz      6f                      // no
  103 5:
  104         movb    %al,(%rdi)              // pack in a byte
  105         addq    $1,%rdi
  106         subl    $1,%ecx
  107         jnz     5b
  108 6:                                      // common exit for the short path
  109         movq    %r8,%rax                // get return value (ie, original ptr)
  110         ret
 111
  112         // Nonzero memset() is long enough to use the SSE bulk routine (Lmemset_pattern in this file; the "commpage" wording is historical).
  113         //      %eax = replicated 4-byte pattern
  114         //      %rdi = ptr
  115         //      %rdx = length (> kShort)
  116         //      %r8  = original ptr, saved by LNonzero for the return value
  117 LCallCommpage:
  118         movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
  119         pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
  120         movq    %rdi,%rcx               // copy dest ptr
  121         negl    %ecx                    // (-ptr) & 15 is the distance to the next 16-byte boundary
  122         andl    $15,%ecx                // get #bytes to align ptr
  123         jz      2f                      // skip if already aligned
  124         subq    %rcx,%rdx               // decrement length
  125 1:                                      // byte loop: every byte of the pattern is the same, so no rotation is needed
  126         movb    %al,(%rdi)              // pack in a byte
  127         addq    $1,%rdi
  128         subl    $1,%ecx
  129         jnz     1b
  130 2:                                      // ptr is 16-byte aligned; more than 240 bytes remain (length > kShort, at most 15 used to align)
  131         call    Lmemset_pattern // call the bulk store routine in this file to do the heavy lifting
  132         movq    %r8,%rax                // get return value (ie, original ptr)
  133         ret
 134
 135
  136         // Handle memset of a 16-byte pattern.
  137         //      as used below: %rsi = c16 (ptr to the pattern); %rdi (ptr) and %rdx (length) are passed through to LAlignPtr
  138         .globl  _memset_pattern16
  139         .align  2, 0x90
  140 _memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
  141         movdqu  (%rsi),%xmm0            // load the pattern (unaligned load, so c16 need not be 16-byte aligned)
  142         jmp     LAlignPtr               // join the common align-and-store code
 143
 144
  145         // Handle memset of an 8-byte pattern.
  146         //      as used below: %rsi = c8 (ptr to the pattern); %rdi (ptr) and %rdx (length) are passed through to LAlignPtr
  147         .globl  _memset_pattern8
  148         .align  2, 0x90
  149 _memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
  150         movq    (%rsi),%xmm0            // load pattern into low 8 bytes
  151         punpcklqdq %xmm0,%xmm0          // replicate into all 16
  152         jmp     LAlignPtr               // join the common align-and-store code
 153
  154         // Handle memset of a 4-byte pattern.
  155         //      as used below: %rsi = c4 (ptr to the pattern); %rdi (ptr) and %rdx (length) are passed through to LAlignPtr
  156         .globl  _memset_pattern4
  157         .align  2, 0x90
  158 _memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
  159         movd    (%rsi),%xmm0            // load pattern into low 4 bytes
  160         pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector
  161         // no jmp here: execution falls through into LAlignPtr, so these two must stay adjacent
 162
  163         // Align ptr if necessary.  We must rotate the pattern right for each byte we
  164         // store while aligning the ptr.  Since there is no rotate instruction in SSE3,
  165         // we have to synthesize the rotates.
  166         //      %rdi = ptr
  167         //      %rdx = length
  168         //      %xmm0 = pattern
  169         // The low 4 bits of %cl select which of the 1-, 2-, 4- and 8-byte stores are needed.
  170 LAlignPtr:                              // NB: _memset_pattern4 drops down to here!
  171         cmpq    $100,%rdx               // long enough to bother aligning ptr?
  172         movq    %rdi,%rcx               // copy ptr (mov leaves the flags from cmpq intact for the jb)
  173         jb      LReady                  // not long enough
  174         negl    %ecx                    // (-ptr) & 15 is the distance to the next 16-byte boundary
  175         andl    $15,%ecx                // get #bytes to align ptr
  176         jz      LReady                  // already aligned
  177         subq    %rcx,%rdx               // adjust length
  178
  179         test    $1,%cl                  // 1-byte store required?
  180         movd    %xmm0,%eax              // get 4 low bytes in %eax (movd leaves the flags from test intact)
  181         jz      2f                      // no
  182         movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
  183         movb    %al,(%rdi)              // pack in the low-order byte
  184         psrldq  $1,%xmm0                // shift pattern right 1 byte
  185         addq    $1,%rdi
  186         pslldq  $15,%xmm1               // shift pattern left 15 bytes
  187         shrl    $8,%eax                 // in case 2-byte store is required: keep %ax equal to the new low 2 pattern bytes
  188         por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
  189 2:
  190         test    $2,%cl                  // 2-byte store required?
  191         jz      4f                      // no
  192         psrldq  $2,%xmm0                // shift pattern down 2 bytes
  193         movw    %ax,(%rdi)              // pack in next two bytes
  194         pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0, completing the 2-byte right rotate
  195         addq    $2,%rdi                 // adjust ptr
  196 4:
  197         test    $4,%cl                  // 4-byte store required?
  198         jz      8f                      // no
  199         movd    %xmm0,(%rdi)            // store low 4 bytes of %xmm0
  200         pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
  201         addq    $4,%rdi                 // adjust ptr
  202 8:
  203         test    $8,%cl                  // 8-byte store required?
  204         jz      LReady                  // no
  205         movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
  206         pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
  207         addq    $8,%rdi                 // adjust ptr
  208
  209         // Ptr is aligned if practical, we're ready to call Lmemset_pattern (in this file) to do the heavy lifting.
  210
  211 LReady:
  212         call    Lmemset_pattern // call the bulk store routine; %rax is not set afterwards (the memset_pattern functions return void)
  213         ret
 214
 215
  216 #define kLShort         63              // operands of <= kLShort bytes take the simple LShort path
  217 #define kVeryLong       (1024*1024)     // by-64 byte counts of at least this many bytes use non-temporal stores
  218 // Lmemset_pattern: local bulk store.  In: %rdi = ptr, %rdx = length, %xmm0 = 16-byte pattern.  Modifies %rax, %rcx, %rdx, %rdi, %xmm0.
  219 Lmemset_pattern:
  220         cmpq    $(kLShort),%rdx         // long enough to bother aligning?
  221         ja      LNotShort               // yes
  222         jmp     LShort                  // no
  223
  224         // Here for short operands or the end of long ones.
  225         //      %rdx = length (<= kLShort)
  226         //      %rdi = ptr (may not be aligned)
  227         //      %xmm0 = pattern
  228         // Entry is at LShort; LUnalignedStore16 is the body of its loop.
  229 LUnalignedStore16:
  230         movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
  231         subl    $16,%edx
  232         addq    $16,%rdi
  233 LShort:
  234         cmpl    $16,%edx                // room for another vector?
  235         jge     LUnalignedStore16       // yes
  236 LLessThan16:                            // here at end of copy with < 16 bytes remaining
  237         test    $8,%dl                  // 8-byte store required?
  238         jz      2f                      // no
  239         movq    %xmm0,(%rdi)            // pack in 8 low bytes
  240         psrldq  $8,%xmm0                // then shift vector down 8 bytes
  241         addq    $8,%rdi
  242 2:
  243         test    $4,%dl                  // 4-byte store required?
  244         jz      3f                      // no
  245         movd    %xmm0,(%rdi)            // pack in 4 low bytes
  246         psrldq  $4,%xmm0                // then shift vector down 4 bytes
  247         addq    $4,%rdi
  248 3:
  249         andl    $3,%edx                 // more to go?
  250         jz      5f                      // no
  251         movd    %xmm0,%eax              // move remainders out into %eax
  252 4:                                      // loop on up to three bytes
  253         movb    %al,(%rdi)              // pack in next byte
  254         shrl    $8,%eax                 // shift next byte into position
  255         incq    %rdi
  256         dec     %edx
  257         jnz     4b
  258 5:      ret                             // the only return from Lmemset_pattern: every path ends at LLessThan16
 259
  260 // Long enough to justify aligning ptr.  Note that we have to rotate the
  261 // pattern to account for any alignment.  We do this by doing two unaligned
  262 // stores, and then an aligned load from the middle of the two stores.
  263 // This will stall on store forwarding alignment mismatch, and the unaligned
  264 // stores can be pretty slow too, but the alternatives aren't any better.
  265 // Fortunately, in most cases our caller has already aligned the ptr.
  266 //      %rdx = length (> kLShort)
  267 //      %rdi = ptr (may not be aligned)
  268 //      %xmm0 = pattern
  269 // Falls through into LAligned once the ptr has been aligned.
  270 LNotShort:
  271         movl    %edi,%ecx               // copy low bits of dest ptr
  272         negl    %ecx                    // (-ptr) & 15 is the distance to the next 16-byte boundary
  273         andl    $15,%ecx                // mask down to #bytes to 16-byte align
  274         jz      LAligned                // skip if already aligned
  275         movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
  276         movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk (in bounds: length > kLShort)
  277         addq    %rcx,%rdi               // now point to the aligned chunk
  278         subq    %rcx,%rdx               // adjust remaining count
  279         movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
  280         addq    $16,%rdi                // skip past the aligned chunk
  281         subq    $16,%rdx                // and take the aligned chunk, already stored, off the count
 282
 283 // Set up for 64-byte loops.
 284 //      %rdx = length remaining
 285 //      %rdi = ptr (aligned)
 286 //      %xmm0 = rotated pattern
 287
 288 LAligned:
 289         movq    %rdx,%rcx               // copy length remaining
 290         andl    $63,%edx                // mask down to residual length (0..63)
 291         andq    $-64,%rcx               // %ecx <- #bytes we will zero in by-64 loop
 292         jz      LNoMoreChunks           // no 64-byte chunks
 293         addq    %rcx,%rdi               // increment ptr by length to move
 294         cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
 295         jge     LVeryLong               // yes
 296         negq    %rcx                    // negate length to move
 297         jmp     1f
 298
 299 // Loop over 64-byte chunks, storing into cache.
 300
 301         .align  4,0x90                  // keep inner loops 16-byte aligned
 302 1:
 303         movdqa  %xmm0,(%rdi,%rcx)
 304         movdqa  %xmm0,16(%rdi,%rcx)
 305         movdqa  %xmm0,32(%rdi,%rcx)
 306         movdqa  %xmm0,48(%rdi,%rcx)
 307         addq    $64,%rcx
 308         jne     1b
 309
 310         jmp     LNoMoreChunks
 311
 312 // Very long operands: use non-temporal stores to bypass cache.
 313
 314 LVeryLong:
 315         negq    %rcx                    // negate length to move
 316         jmp     1f
 317
 318         .align  4,0x90                  // keep inner loops 16-byte aligned
 319 1:
 320         movntdq %xmm0,(%rdi,%rcx)
 321         movntdq %xmm0,16(%rdi,%rcx)
 322         movntdq %xmm0,32(%rdi,%rcx)
 323         movntdq %xmm0,48(%rdi,%rcx)
 324         addq    $64,%rcx
 325         jne     1b
 326
 327         sfence                          // required by non-temporal stores
 328         jmp     LNoMoreChunks
 329
  330 // Handle leftovers: loop by 16.
  331 //      %edx = length remaining (<64)
  332 //      %rdi = ptr (aligned)
  333 //      %xmm0 = rotated pattern
  334 // Entry is at LNoMoreChunks; in the code shown, LLoopBy16 is reached only from the jge below.
  335 LLoopBy16:
  336         movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
  337         subl    $16,%edx                // decrement count
  338         addq    $16,%rdi                // increment ptr
  339 LNoMoreChunks:
  340         cmpl    $16,%edx                // more to go?
  341         jge     LLoopBy16               // yes
  342         jmp     LLessThan16             // handle up to 15 remaining bytes