*/
#include <machine/cpu_capabilities.h>
-
+
/* This file contains the following functions:
*
* void *memset(void *b, int c, size_t len);
* void memset_pattern4(void *b, const void *c4, size_t len);
* void memset_pattern8(void *b, const void *c8, size_t len);
* void memset_pattern16(void *b, const void *c16, size_t len);
*
* Calls of memset() with c==0 are routed to the bzero() routine. Most of the
- * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
+ * others go to Lmemset_pattern, which is entered as follows:
* %rdi = ptr to memory to set (aligned)
* %edx = length (which can be short, though we bias in favor of long operands)
* %xmm0 = the pattern to store
* NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
* on P4s and probably other processors.
*/
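+
+// Illustrative use of the pattern routines from C (a sketch; `buf' and
+// `pat' are hypothetical names, not part of this file):
+//
+//	#include <string.h>
+//	#include <stdint.h>
+//	uint32_t pat = 0xDEADBEEF;
+//	char buf[64];
+//	memset_pattern4(buf, &pat, sizeof(buf)); // tile the 4 pattern bytes across buf
+//
+// If len is not a multiple of the pattern size, the final copy of the
+// pattern is truncated.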
-
- #define kShort 255 // for nonzero memset(), too short for commpage
-
-
+
+#define kShort 255 // for nonzero memset(), too short for Lmemset_pattern
+
+
.text
.globl _memset
.align 2
_memset: // void *memset(void *b, int c, size_t len);
andl $0xFF,%esi // (c==0) ?
jnz LNonzero // not a bzero
-
- movq $(_COMM_PAGE_BZERO),%rax // map memset(p,0,n) into bzero(p,n)
+
movq %rdx,%rsi // put count where bzero() expects it
- jmp *%rax // enter commpage
+ jmp _bzero // map memset(p,0,n) into bzero(p,n)
+
+// Handle memset of a nonzero value.
- // Handle memset of a nonzero value.
-
LNonzero:
movq %rdi,%r8 // preserve the original pointer so we can return it
movl %esi,%eax // replicate byte in %esi into all four bytes
shll $8,%esi
orl %esi,%eax // now %ax has "c" in both bytes
movl %eax,%esi
shll $16,%esi
orl %esi,%eax // now %eax has "c" in all 4 bytes
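+// (Illustrative: c == 0x5A leaves %eax == 0x5A5A5A5A.)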
cmpq $(kShort),%rdx // is operand too short for SSE?
ja LCallCommpage // no
-
-// Nonzero memset() too short to call commpage.
-// %eax = replicated 4-byte pattern
-// %rdi = ptr
-// %edx = length (<= kShort)
-
+
+ // Nonzero memset() too short to call Lmemset_pattern.
+ // %eax = replicated 4-byte pattern
+ // %rdi = ptr
+ // %edx = length (<= kShort)
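+ // (For example, len == 10 stores ten single bytes; len == 40 stores
+ // bytes until the ptr is doubleword aligned, then doublewords, then
+ // mops up the leftover bytes. See the code below.)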
+
cmpl $16,%edx // long enough to word align?
jge 3f // yes
test %edx,%edx // length==0?
jz 6f // yes, done
1: // loop storing single bytes
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%edx
jnz 1b
jmp 6f
3: // store bytes until ptr is doubleword aligned
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%edx
test $3,%edi // is ptr doubleword aligned?
jnz 3b // no
movl %edx,%ecx // copy length
shrl $2,%edx // #doublewords to store
4:
movl %eax,(%rdi) // store aligned doubleword
addq $4,%rdi
subl $1,%edx
jnz 4b
andl $3,%ecx // any leftover bytes?
jz 6f // no
5:
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%ecx
jnz 5b
6:
movq %r8,%rax // get return value (ie, original ptr)
ret
-
-// Nonzero memset() is long enough to call commpage.
-// %eax = replicated 4-byte pattern
-// %rdi = ptr
-// %rdx = length (> kShort)
-
+
+ // Nonzero memset() is long enough to call Lmemset_pattern.
+ // %eax = replicated 4-byte pattern
+ // %rdi = ptr
+ // %rdx = length (> kShort)
+
LCallCommpage:
movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
movq %rdi,%rcx // copy dest ptr
negl %ecx
andl $15,%ecx // get #bytes to align ptr
jz 2f // skip if already aligned
subq %rcx,%rdx // decrement length
1: // loop storing bytes until ptr is aligned
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%ecx
jnz 1b
2: // ptr aligned, length long enough to justify calling Lmemset_pattern
- movq $(_COMM_PAGE_MEMSET_PATTERN),%rax
- call *%rax // call commpage to do the heavy lifting
+ call Lmemset_pattern // do the heavy lifting
movq %r8,%rax // get return value (ie, original ptr)
ret
-// Handle memset of a 16-byte pattern.
-
+ // Handle memset of a 16-byte pattern.
+
.globl _memset_pattern16
.align 2, 0x90
_memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len);
movdqu (%rsi),%xmm0 // load the pattern
jmp LAlignPtr
-// Handle memset of an 8-byte pattern.
-
+ // Handle memset of an 8-byte pattern.
+
.globl _memset_pattern8
.align 2, 0x90
_memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len);
movq (%rsi),%xmm0 // load pattern into low 8 bytes
punpcklqdq %xmm0,%xmm0 // replicate into all 16
jmp LAlignPtr
-// Handle memset of a 4-byte pattern.
-
+ // Handle memset of a 4-byte pattern.
+
.globl _memset_pattern4
.align 2, 0x90
_memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len);
movd (%rsi),%xmm0 // load pattern into low 4 bytes
pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
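+// (Illustrative: c4 == "ABCD" leaves %xmm0 == "ABCDABCDABCDABCD".)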
-// Align ptr if necessary. We must rotate the pattern right for each byte we
-// store while aligning the ptr. Since there is no rotate instruction in SSE3,
-// we have to synthesize the rotates.
-// %rdi = ptr
-// %rdx = length
-// %xmm0 = pattern
-
+ // Align ptr if necessary. We must rotate the pattern right for each byte we
+ // store while aligning the ptr. Since there is no rotate instruction in SSE3,
+ // we have to synthesize the rotates.
+ // %rdi = ptr
+ // %rdx = length
+ // %xmm0 = pattern
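+ // For example, with pattern "ABCD..." and a ptr one byte shy of
+ // alignment, we store "A", rotate the vector right one byte so its low
+ // byte is "B", and the subsequent aligned stores continue the pattern
+ // seamlessly: "A" + "BCDA BCDA ..." == "ABCD ABCD ...".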
+
LAlignPtr: // NB: can drop down to here!
cmpq $100,%rdx // long enough to bother aligning ptr?
movq %rdi,%rcx // copy ptr
jb LReady // not long enough
negl %ecx // start to compute #bytes to align ptr
andl $15,%ecx // get #bytes to align ptr
jz LReady // already aligned
subq %rcx,%rdx // adjust length
-
+
test $1,%cl // 1-byte store required?
movd %xmm0,%eax // get 4 low bytes in %eax
jz 2f // no
movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
movb %al,(%rdi) // pack in the low-order byte
psrldq $1,%xmm1 // shift pattern right 1 byte
addq $1,%rdi
pslldq $15,%xmm0 // shift pattern left 15 bytes
shrl $8,%eax // in case 2-byte store is required
por %xmm1,%xmm0 // merge into the rotated pattern
2:
test $2,%cl // 2-byte store required?
jz 4f // no
psrldq $2,%xmm0 // shift pattern down 2 bytes
movw %ax,(%rdi) // pack in next two bytes
pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
shrl $16,%eax // in case 4-byte store is required
addq $2,%rdi // adjust ptr
4:
test $4,%cl // 4-byte store required?
jz 8f // no
movd %xmm0,(%rdi) // store low 4 bytes of %xmm0
pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
addq $4,%rdi // adjust ptr
8:
test $8,%cl // 8-byte store required?
jz LReady // no
movq %xmm0,(%rdi) // store low 8 bytes of %xmm0
pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
addq $8,%rdi // adjust ptr
-
-// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
+
+ // Ptr is aligned if practical; we're ready to call Lmemset_pattern to do the heavy lifting.
LReady:
- movq $(_COMM_PAGE_MEMSET_PATTERN),%rax
- call *%rax // call commpage to do the heavy lifting
+ call Lmemset_pattern // do the heavy lifting
ret
+
+
+#define kLShort 63
+#define kVeryLong (1024*1024)
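+
+// kLShort: at 63 bytes or less, aligning the ptr isn't worth the setup.
+// kVeryLong: past 1MB (roughly cache-sized; the exact cutoff is a tuning
+// choice) we switch to non-temporal stores so a huge operand doesn't
+// flush the caches.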
+
+Lmemset_pattern:
+ cmpq $(kLShort),%rdx // long enough to bother aligning?
+ ja LNotShort // yes
+ jmp LShort // no
+
+ // Here for short operands or the end of long ones.
+ // %rdx = length (<= kLShort)
+ // %rdi = ptr (may not be aligned)
+ // %xmm0 = pattern
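+ // (Worked example: len == 29 stores one unaligned 16-byte vector at
+ // LShort, then falls into LLessThan16, which stores 8 + 4 + 1 bytes,
+ // shifting the pattern down after each store so consecutive pattern
+ // bytes are emitted: 16+8+4+1 == 29.)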
+
+LUnalignedStore16:
+ movdqu %xmm0,(%rdi) // stuff in another 16 bytes
+ subl $16,%edx
+ addq $16,%rdi
+LShort:
+ cmpl $16,%edx // room for another vector?
+ jge LUnalignedStore16 // yes
+LLessThan16: // here at end of copy with < 16 bytes remaining
+ test $8,%dl // 8-byte store required?
+ jz 2f // no
+ movq %xmm0,(%rdi) // pack in 8 low bytes
+ psrldq $8,%xmm0 // then shift vector down 8 bytes
+ addq $8,%rdi
+2:
+ test $4,%dl // 4-byte store required?
+ jz 3f // no
+ movd %xmm0,(%rdi) // pack in 4 low bytes
+ psrldq $4,%xmm0 // then shift vector down 4 bytes
+ addq $4,%rdi
+3:
+ andl $3,%edx // more to go?
+ jz 5f // no
+ movd %xmm0,%eax // move remainders out into %eax
+4: // loop on up to three bytes
+ movb %al,(%rdi) // pack in next byte
+ shrl $8,%eax // shift next byte into position
+ incq %rdi
+ dec %edx
+ jnz 4b
+5: ret
+
+// Long enough to justify aligning ptr. Note that we have to rotate the
+// pattern to account for any alignment. We do this by doing two unaligned
+// stores, and then an aligned load from the middle of the two stores.
+// This will stall on store forwarding alignment mismatch, and the unaligned
+// stores can be pretty slow too, but the alternatives aren't any better.
+// Fortunately, in most cases our caller has already aligned the ptr.
+// %rdx = length (> kLShort)
+// %rdi = ptr (may not be aligned)
+// %xmm0 = pattern
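+//
+// (Worked example: if %rdi % 16 == 12, then %rcx == 4. The two movdqu
+// stores cover %rdi..%rdi+31, and the movdqa load from %rdi+4, which is
+// now 16-byte aligned, picks up pattern bytes 4..15 followed by 0..3,
+// i.e. the pattern pre-rotated to match the aligned ptr.)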
+
+LNotShort:
+ movl %edi,%ecx // copy low bits of dest ptr
+ negl %ecx
+ andl $15,%ecx // mask down to #bytes to 16-byte align
+ jz LAligned // skip if already aligned
+ movdqu %xmm0,(%rdi) // store 16 unaligned bytes
+ movdqu %xmm0,16(%rdi) // and 16 more, to be sure we have an aligned chunk
+ addq %rcx,%rdi // now point to the aligned chunk
+ subq %rcx,%rdx // adjust remaining count
+ movdqa (%rdi),%xmm0 // get the rotated pattern (probably stalling)
+ addq $16,%rdi // skip past the aligned chunk
+ subq $16,%rdx
+
+// Set up for 64-byte loops.
+// %rdx = length remaining
+// %rdi = ptr (aligned)
+// %xmm0 = rotated pattern
+
+LAligned:
+ movq %rdx,%rcx // copy length remaining
+ andl $63,%edx // mask down to residual length (0..63)
+ andq $-64,%rcx // %rcx <- #bytes we will store in the by-64 loop
+ jz LNoMoreChunks // no 64-byte chunks
+ addq %rcx,%rdi // increment ptr by length to move
+ cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores?
+ jge LVeryLong // yes
+ negq %rcx // negate length to move
+ jmp 1f
+
+// Loop over 64-byte chunks, storing into cache.
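+// (%rcx runs from -(#bytes in the chunks) up to zero, so the stores index
+// forward from the pre-advanced %rdi and the add's zero flag terminates
+// the loop without a separate compare.)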
+
+ .align 4,0x90 // keep inner loops 16-byte aligned
+1:
+ movdqa %xmm0,(%rdi,%rcx)
+ movdqa %xmm0,16(%rdi,%rcx)
+ movdqa %xmm0,32(%rdi,%rcx)
+ movdqa %xmm0,48(%rdi,%rcx)
+ addq $64,%rcx
+ jne 1b
+
+ jmp LNoMoreChunks
+
+// Very long operands: use non-temporal stores to bypass cache.
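+// (movntdq stores write-combine and go straight to memory rather than
+// pulling lines into the cache, so a huge memset doesn't evict the
+// caller's working set; the sfence below orders these weakly-ordered
+// stores before we return.)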
+
+LVeryLong:
+ negq %rcx // negate length to move
+ jmp 1f
+
+ .align 4,0x90 // keep inner loops 16-byte aligned
+1:
+ movntdq %xmm0,(%rdi,%rcx)
+ movntdq %xmm0,16(%rdi,%rcx)
+ movntdq %xmm0,32(%rdi,%rcx)
+ movntdq %xmm0,48(%rdi,%rcx)
+ addq $64,%rcx
+ jne 1b
+
+ sfence // required by non-temporal stores
+ jmp LNoMoreChunks
+
+// Handle leftovers: loop by 16.
+// %edx = length remaining (<64)
+// %rdi = ptr (aligned)
+// %xmm0 = rotated pattern
+
+LLoopBy16:
+ movdqa %xmm0,(%rdi) // pack in 16 more bytes
+ subl $16,%edx // decrement count
+ addq $16,%rdi // increment ptr
+LNoMoreChunks:
+ cmpl $16,%edx // more to go?
+ jge LLoopBy16 // yes
+ jmp LLessThan16 // handle up to 15 remaining bytes