osfmk/i386/commpage/memset_pattern_sse3_64.s

   1 /*
   2  * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22
  23 #include <machine/cpu_capabilities.h>
  24 #include <machine/commpage.h>
  25
  26 /* The common path for nonzero memset and the memset_pattern routines,
  27  * tuned for Pentium-M class processors with SSE3 and 64-byte cache lines.
  28  * This is the 64-bit version.  It is used by the following functions:
  29  *
  30  *      void *memset(void *b, int c, size_t len);                   // when c!=0
  31  *      void memset_pattern4(void *b, const void *c4, size_t len);
  32  *      void memset_pattern8(void *b, const void *c8, size_t len);
  33  *      void memset_pattern16(void *b, const void *c16, size_t len);
  34  *
  35  * Note bzero() and memset() of 0 are handled separately.
  36  */
  37
  38 #define kShort          63
  39 #define kVeryLong       (1024*1024)
  40
  41 // Initial entry from Libc with parameters passed in registers.  Although we
  42 // correctly handle misaligned ptrs and short operands, they are inefficient.
  43 // Therefore our caller should filter out short operands and exploit local
  44 // knowledge (ie, original pattern length) to align the ptr if possible.
  45 // When called, we expect:
  46 //      %rdi = ptr to memory to set (not necessarily aligned)
  47 //      %rdx = length (may be short or even 0)
  48 //      %xmm0 = the pattern to store
  49 // Return conditions:
  50 //      %rax, %rdi, %rsi, %rcx, and %rdx all trashed
  51 //      we preserve %r8, %r9, %r10, and %r11
  52
  53         .text
  54         .align  5, 0x90                 // 32-byte align the entry point, padding with 0x90 (nop)
  55         .code64
  56 Lmemset_pattern_sse3_64:                // entry: dispatch on length in %rdx
  57         cmpq    $(kShort),%rdx          // long enough to bother aligning? (length > kShort)
  58         ja      LNotShort               // yes: unsigned compare, so align ptr and use 64-byte loops
  59         jmp     LShort                  // no: store with unaligned 16-byte vectors, then the tail
  60
  61 // Here for short operands or the end of long ones.
  62 //      %rdx = length (<= kShort)
  63 //      %rdi = ptr (may not be aligned)
  64 //      %xmm0 = pattern
  65
  66 LUnalignedStore16:                      // loop body: entered from LShort while >= 16 bytes remain
  67         movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes (unaligned store)
  68         subl    $16,%edx                // 16 fewer bytes remaining
  69         addq    $16,%rdi                // advance ptr past the bytes just stored
  70 LShort:                                 // loop test: %edx = bytes remaining
  71         cmpl    $16,%edx                // room for another vector?
  72         jge     LUnalignedStore16       // yes
  73 LLessThan16:                            // here at end of copy with < 16 bytes remaining (also reached from LNoMoreChunks)
  74         test    $8,%dl                  // 8-byte store required? (bit 3 of remaining count)
  75         jz      2f                      // no
  76         movq    %xmm0,(%rdi)            // pack in 8 low bytes
  77         psrldq  $8,%xmm0                // then shift vector down 8 bytes
  78         addq    $8,%rdi                 // advance ptr past the 8 bytes
  79 2:
  80         test    $4,%dl                  // 4-byte store required? (bit 2 of remaining count)
  81         jz      3f                      // no
  82         movd    %xmm0,(%rdi)            // pack in 4 low bytes
  83         psrldq  $4,%xmm0                // then shift vector down 4 bytes
  84         addq    $4,%rdi                 // advance ptr past the 4 bytes
  85 3:
  86         andl    $3,%edx                 // more to go? (%edx <- 0..3 bytes left)
  87         jz      5f                      // no
  88         movd    %xmm0,%eax              // move remainders out into %eax
  89 4:                                      // loop on up to three bytes
  90         movb    %al,(%rdi)              // pack in next byte
  91         shrl    $8,%eax                 // shift next byte into position
  92         incq    %rdi                    // advance ptr by one byte
  93         dec     %edx                    // one fewer byte to go; sets ZF when done
  94         jnz     4b                      // loop until count reaches zero
  95 5:      ret                             // all bytes stored
  96
  97 // Long enough to justify aligning ptr.  Note that we have to rotate the
  98 // pattern to account for any alignment.  We do this by doing two unaligned
  99 // stores, and then an aligned load from the middle of the two stores.
 100 // This will stall on store forwarding alignment mismatch, and the unaligned
 101 // stores can be pretty slow too, but the alternatives aren't any better.
 102 // Fortunately, in most cases our caller has already aligned the ptr.
 103 //      %rdx = length (> kShort)
 104 //      %rdi = ptr (may not be aligned)
 105 //      %xmm0 = pattern
 106
 107 LNotShort:                              // 16-byte align the ptr and rotate the pattern to match
 108         movl    %edi,%ecx               // copy low bits of dest ptr
 109         negl    %ecx                    // negate so the low 4 bits give the distance to the next boundary
 110         andl    $15,%ecx                // mask down to #bytes to 16-byte align (0..15)
 111         jz      LAligned                // skip if already aligned
 112         movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
 113         movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
 114         addq    %rcx,%rdi               // now point to the aligned chunk
 115         subq    %rcx,%rdx               // adjust remaining count
 116         movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
 117         addq    $16,%rdi                // skip past the aligned chunk
 118         subq    $16,%rdx                // count those 16 bytes as done; fall through to LAligned
 119
 120 // Set up for 64-byte loops.
 121 //      %rdx = length remaining
 122 //      %rdi = ptr (aligned)
 123 //      %xmm0 = rotated pattern
 124
 125 LAligned:                               // split remaining length into 64-byte chunks plus residual
 126         movq    %rdx,%rcx               // copy length remaining
 127         andl    $63,%edx                // mask down to residual length (0..63); 32-bit op clears upper half of %rdx
 128         andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loop
 129         jz      LNoMoreChunks           // no 64-byte chunks
 130         addq    %rcx,%rdi               // point ptr at end of chunks; loops index backward from it
 131         cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
 132         jae     LVeryLong               // yes (unsigned compare: %rcx is a byte count)
 133         negq    %rcx                    // negate length to move; offset counts up toward zero
 134         jmp     1f                      // enter loop, skipping any alignment padding
 135
 136 // Loop over 64-byte chunks, storing into cache.
 137
 138         .align  4,0x90                  // keep inner loops 16-byte aligned
 139 1:
 140         movdqa  %xmm0,(%rdi,%rcx)       // four aligned 16-byte stores cover one 64-byte chunk
 141         movdqa  %xmm0,16(%rdi,%rcx)
 142         movdqa  %xmm0,32(%rdi,%rcx)
 143         movdqa  %xmm0,48(%rdi,%rcx)
 144         addq    $64,%rcx                // advance offset; reaches zero after the last chunk
 145         jne     1b                      // more chunks to store
 146
 147         jmp     LNoMoreChunks           // go store the residual (< 64) bytes
 148
 149 // Very long operands: use non-temporal stores to bypass cache.
 150
 151 LVeryLong:                              // same 64-byte loop, but with non-temporal stores
 152         negq    %rcx                    // negate length to move; offset counts up toward zero
 153         jmp     1f                      // enter loop, skipping any alignment padding
 154
 155         .align  4,0x90                  // keep inner loops 16-byte aligned
 156 1:
 157         movntdq %xmm0,(%rdi,%rcx)       // four non-temporal 16-byte stores cover one 64-byte chunk
 158         movntdq %xmm0,16(%rdi,%rcx)
 159         movntdq %xmm0,32(%rdi,%rcx)
 160         movntdq %xmm0,48(%rdi,%rcx)
 161         addq    $64,%rcx                // advance offset; reaches zero after the last chunk
 162         jne     1b                      // more chunks to store
 163
 164         sfence                          // required by non-temporal stores
 165         jmp     LNoMoreChunks           // go store the residual (< 64) bytes
 166
 167 // Handle leftovers: loop by 16.
 168 //      %edx = length remaining (<64)
 169 //      %rdi = ptr (aligned)
 170 //      %xmm0 = rotated pattern
 171
 172 LLoopBy16:                              // loop body: entered from LNoMoreChunks while >= 16 bytes remain
 173         movdqa  %xmm0,(%rdi)            // pack in 16 more bytes (aligned store)
 174         subl    $16,%edx                // decrement count
 175         addq    $16,%rdi                // increment ptr
 176 LNoMoreChunks:                          // loop test: %edx = residual bytes after the 64-byte loops
 177         cmpl    $16,%edx                // more to go?
 178         jge     LLoopBy16               // yes
 179         jmp     LLessThan16             // handle up to 15 remaining bytes
 180
 181         COMMPAGE_DESCRIPTOR(memset_pattern_sse3_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE3,0)   // NOTE(review): presumably registers this routine at _COMM_PAGE_MEMSET_PATTERN for CPUs with kHasSSE3 -- confirm against the macro in <machine/commpage.h>