x86_64/string/memset.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22
  23 #include <machine/cpu_capabilities.h>
  24
  25
  26 /* This file contains the following functions:
  27  *
  28  *      void *memset(void *b, int c, size_t len);
  29  *      void memset_pattern4(void *b, const void *c4, size_t len);
  30  *      void memset_pattern8(void *b, const void *c8, size_t len);
  31  *      void memset_pattern16(void *b, const void *c16, size_t len);
  32  *
  33  * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
  34  * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
  35  *      %rdi = ptr to memory to set (aligned)
  36  *      %rdx = length (which can be short, though we bias in favor of long operands)
  37  *      %xmm0 = the pattern to store
  38  * Return conditions:
  39  *      %eax, %edi, %esi, %ecx, and %edx all trashed
  40  *
  41  * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
  42  * on P4s and probably other processors.
  43  */
  44
  45  #define kShort 255                     // nonzero memset() of <= kShort bytes is done inline, not via the commpage
  46
  47         // memset(): %rdi = b, %esi = c (only the low byte is used), %rdx = len.
  48         .text
  49         .globl  _memset
  50         .align  2
  51 _memset:                                // void *memset(void *b, int c, size_t len);
  52         andl    $0xFF,%esi              // keep only the low byte of c; is it zero?
  53         jnz             LNonzero                // not a bzero
  54
  55         movq    $(_COMM_PAGE_BZERO),%rax// map memset(p,0,n) into bzero(p,n)
  56         movq    %rdx,%rsi               // put count where bzero() expects it
  57         jmp             *%rax                   // enter commpage (tail jump: bzero returns straight to our caller)
  58         // NOTE(review): %rax is never loaded with b on this path, so the value memset returns here is whatever the commpage bzero leaves in %rax -- confirm.
  59
  60         // Handle memset of a nonzero value.
  61
  62 LNonzero:
  63         movq    %rdi,%r8                // preserve the original pointer so we can return it
  64         movl    %esi,%eax               // replicate byte in %esi into all four bytes
  65         shll    $8,%esi                 // %esi = 0x0000cc00 (cc = fill byte)
  66         orl             %esi,%eax               // %eax = 0x0000cccc
  67         movl    %eax,%esi
  68         shll    $16,%esi                // %esi = 0xcccc0000
  69         orl             %esi,%eax               // now %eax has "c" in all 4 bytes
  70         cmpq    $(kShort),%rdx          // is operand too short for SSE?
  71         ja              LCallCommpage           // no: len > kShort (unsigned), use the commpage
  72
  73 // Nonzero memset() too short to call commpage.
  74 //      %eax = replicated 4-byte pattern
  75 //      %rdi = ptr
  76 //      %edx = length (<= kShort, so the 32-bit and signed tests below are safe)
  77
  78         cmpl    $16,%edx                // long enough to word align?
  79         jge     3f                      // yes
  80         test    %edx,%edx               // length==0?
  81         jz      6f                      // yes: nothing to store
  82 1:                                      // byte loop for lengths 1..15
  83         movb    %al,(%rdi)              // pack in a byte
  84         addq    $1,%rdi
  85         subl    $1,%edx
  86         jnz     1b
  87         jmp     6f
  88 2:                                      // store one byte at a time until ptr is doubleword aligned
  89         movb    %al,(%rdi)              // pack in a byte
  90         addq    $1,%rdi
  91         subl    $1,%edx
  92 3:
  93         test    $3,%edi                 // is ptr doubleword aligned?
  94         jnz     2b                      // no
  95         movl    %edx,%ecx               // copy length (low 2 bits = leftover byte count)
  96         shrl    $2,%edx                 // #doublewords to store (nonzero: >= 13 bytes remain after aligning)
  97 4:
  98         movl    %eax,(%rdi)             // store aligned doubleword
  99         addq    $4,%rdi
 100         subl    $1,%edx
 101         jnz     4b
 102         andl    $3,%ecx                 // any leftover bytes?
 103         jz      6f                      // no
 104 5:
 105         movb    %al,(%rdi)              // pack in a byte
 106         addq    $1,%rdi
 107         subl    $1,%ecx
 108         jnz     5b
 109 6:
 110         movq    %r8,%rax                // get return value (ie, original ptr)
 111         ret
 112
 113 // Nonzero memset() is long enough to call commpage.
 114 //      %eax = replicated 4-byte pattern
 115 //      %rdi = ptr
 116 //      %rdx = length (> kShort)
 117
 118 LCallCommpage:
 119         movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
 120         pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
 121         movq    %rdi,%rcx               // copy dest ptr
 122         negl    %ecx                    // low 4 bits of -ptr = distance to the next 16-byte boundary
 123         andl    $15,%ecx                // get #bytes to align ptr
 124         jz      2f                      // skip if already aligned
 125         subq    %rcx,%rdx               // decrement length (> kShort, so this cannot underflow)
 126 1:
 127         movb    %al,(%rdi)              // pack in a byte
 128         addq    $1,%rdi
 129         subl    $1,%ecx
 130         jnz     1b
 131 2:                                      // ptr aligned, length long enough to justify
 132         movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
 133         call    *%rax                   // call commpage to do the heavy lifting
 134         movq    %r8,%rax                // get return value (ie, original ptr); relies on %r8 surviving the call
 135         ret
 136
 137
 138 // Handle memset of a 16-byte pattern: %rdi = b, %rsi = c16, %rdx = len.
 139
 140         .globl  _memset_pattern16
 141         .align  2, 0x90
 142 _memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
 143         movdqu  (%rsi),%xmm0            // load the pattern (unaligned load, so c16 needs no particular alignment)
 144         jmp     LAlignPtr               // join the shared align-and-store path; %rdi and %rdx are untouched
 145
 146
 147 // Handle memset of an 8-byte pattern: %rdi = b, %rsi = c8, %rdx = len.
 148
 149         .globl  _memset_pattern8
 150         .align  2, 0x90
 151 _memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
 152         movq    (%rsi),%xmm0            // load pattern into low 8 bytes
 153         punpcklqdq %xmm0,%xmm0          // replicate into all 16 (low quadword copied into the high quadword)
 154         jmp     LAlignPtr               // join the shared align-and-store path; %rdi and %rdx are untouched
 155
 156 // Handle memset of a 4-byte pattern: %rdi = b, %rsi = c4, %rdx = len.
 157
 158         .globl  _memset_pattern4
 159         .align  2, 0x90
 160 _memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
 161         movd    (%rsi),%xmm0            // load pattern into low 4 bytes
 162         pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector
 163         // no jump needed: execution falls through into LAlignPtr
 164
 165 // Align ptr if necessary.  We must rotate the pattern right for each byte we
 166 // store while aligning the ptr.  Since there is no rotate instruction in SSE3,
 167 // we have to synthesize the rotates.
 168 //      %rdi = ptr
 169 //      %rdx = length
 170 //      %xmm0 = pattern
 171 // The 1-, 2-, 4- and 8-byte steps below are selected by bits 0..3 of %cl (#bytes to align).
 172 LAlignPtr:                              // NB: _memset_pattern4 drops down to here; pattern8/pattern16 jump here
 173         cmpq    $100,%rdx               // long enough to bother aligning ptr?
 174         movq    %rdi,%rcx               // copy ptr (mov does not disturb the flags from cmpq)
 175         jb      LReady                  // not long enough: ptr is passed on as is
 176         negl    %ecx                    // low 4 bits of -ptr = distance to the next 16-byte boundary
 177         andl    $15,%ecx                // get #bytes to align ptr
 178         jz      LReady                  // already aligned
 179         subq    %rcx,%rdx               // adjust length (>= 100 here, so this cannot underflow)
 180
 181         test    $1,%cl                  // 1-byte store required?
 182         movd    %xmm0,%eax              // get 4 low bytes in %eax (movd does not disturb the flags from test)
 183         jz      2f                      // no
 184         movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
 185         movb    %al,(%rdi)              // pack in the low-order byte
 186         psrldq  $1,%xmm0                // shift pattern right 1 byte
 187         addq    $1,%rdi
 188         pslldq  $15,%xmm1               // shift pattern left 15 bytes
 189         shrl    $8,%eax                 // discard the byte just stored so %ax holds the next two pattern bytes, in case 2-byte store is required
 190         por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
 191 2:
 192         test    $2,%cl                  // 2-byte store required?
 193         jz      4f                      // no
 194         psrldq  $2,%xmm0                // shift pattern down 2 bytes
 195         movw    %ax,(%rdi)              // pack in next two bytes
 196         pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0, completing the 2-byte right rotate
 197         addq    $2,%rdi                 // adjust ptr
 198 4:
 199         test    $4,%cl                  // 4-byte store required?
 200         jz      8f                      // no
 201         movd    %xmm0,(%rdi)            // store low 4 bytes of %xmm0
 202         pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
 203         addq    $4,%rdi                 // adjust ptr
 204 8:
 205         test    $8,%cl                  // 8-byte store required?
 206         jz      LReady                  // no
 207         movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
 208         pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
 209         addq    $8,%rdi                 // adjust ptr
 210
 211 // Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
 212
 213 LReady:                                 // %rdi = ptr, %rdx = remaining length, %xmm0 = pattern rotated to match ptr
 214         movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
 215         call    *%rax                   // call commpage to do the heavy lifting
 216         ret                             // nothing to return: the memset_pattern functions are void