osfmk/i386/commpage/memset_pattern_sse3.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22
  23 #include <machine/cpu_capabilities.h>
  24 #include <machine/commpage.h>
  25
  26 /* The common path for nonzero memset and the memset_pattern routines,
  27  * tuned for Pentium-M class processors with SSE3 and 64-byte cache lines.
  28  * This is used by the following functions:
  29  *
  30  *      void *memset(void *b, int c, size_t len);                   // when c!=0
  31  *      void memset_pattern4(void *b, const void *c4, size_t len);
  32  *      void memset_pattern8(void *b, const void *c8, size_t len);
  33  *      void memset_pattern16(void *b, const void *c16, size_t len);
  34  *
  35  * Note bzero() and memset() of 0 are handled separately.
  36  */
  37
  38 #define kShort          63
  39 #define kVeryLong       (1024*1024)
  40
  41 // Initial entry from Libc with parameters passed in registers.  Although we
  42 // correctly handle misaligned ptrs and short operands, they are inefficient.
  43 // Therefore our caller should filter out short operands and exploit local
  44 // knowledge (ie, original pattern length) to align the ptr if possible.
  45 // When called, we expect:
  46 //      %edi = ptr to memory to set (not necessarily aligned)
  47 //      %edx = length (may be short or even 0)
  48 //      %xmm0 = the pattern to store
  49 // Return conditions:
  50 //      %eax, %edi, %esi, %ecx, and %edx all trashed
  51
  52         .text
  53         .align  5, 0x90
  54 Lmemset_pattern_sse3:
  55         cmpl    $(kShort),%edx          // long enough to bother aligning?
  56         ja      LNotShort               // yes
  57         jmp     LShort                  // no
  58
  59 // Here for short operands or the end of long ones.
  60 //      %edx = length
  61 //      %edi = ptr (may not be not aligned)
  62 //      %xmm0 = pattern
  63
  64 LUnalignedStore16:
  65         movdqu  %xmm0,(%edi)            // stuff in another 16 bytes
  66         subl    $16,%edx
  67         addl    $16,%edi
  68 LShort:
  69         cmpl    $16,%edx                // room for another vector?
  70         jge     LUnalignedStore16       // yes
  71 LLessThan16:                            // here at end of copy with < 16 bytes remaining
  72         test    $8,%dl                  // 8-byte store required?
  73         jz      2f                      // no
  74         movq    %xmm0,(%edi)            // pack in 8 low bytes
  75         psrldq  $8,%xmm0                // then shift vector down 8 bytes
  76         addl    $8,%edi
  77 2:
  78         test    $4,%dl                  // 4-byte store required?
  79         jz      3f                      // no
  80         movd    %xmm0,(%edi)            // pack in 4 low bytes
  81         psrldq  $4,%xmm0                // then shift vector down 4 bytes
  82         addl    $4,%edi
  83 3:
  84         andl    $3,%edx                 // more to go?
  85         jz      5f                      // no
  86         movd    %xmm0,%eax              // move remainders out into %eax
  87 4:                                      // loop on up to three bytes
  88         movb    %al,(%edi)              // pack in next byte
  89         shrl    $8,%eax                 // shift next byte into position
  90         inc     %edi
  91         dec     %edx
  92         jnz     4b
  93 5:      ret
  94
  95 // Long enough to justify aligning ptr.  Note that we have to rotate the
  96 // pattern to account for any alignment.  We do this by doing two unaligned
  97 // stores, and then an aligned load from the middle of the two stores.
  98 // This will stall on store forwarding alignment mismatch, and the unaligned
  99 // stores can be pretty slow too, but the alternatives aren't any better.
 100 // Fortunately, in most cases our caller has already aligned the ptr.
 101 //      %edx = length (> kShort)
 102 //      %edi = ptr (may not be aligned)
 103 //      %xmm0 = pattern
 104
 105 LNotShort:
 106         movl    %edi,%ecx               // copy dest ptr
 107         negl    %ecx
 108         andl    $15,%ecx                // mask down to #bytes to 16-byte align
 109         jz      LAligned                // skip if already aligned
 110         movdqu  %xmm0,(%edi)            // store 16 unaligned bytes
 111         movdqu  %xmm0,16(%edi)          // and 16 more, to be sure we have an aligned chunk
 112         addl    %ecx,%edi               // now point to the aligned chunk
 113         subl    %ecx,%edx               // adjust remaining count
 114         movdqa  (%edi),%xmm0            // get the rotated pattern (probably stalling)
 115         addl    $16,%edi                // skip past the aligned chunk
 116         subl    $16,%edx
 117
 118 // Set up for 64-byte loops.
 119 //      %edx = length remaining
 120 //      %edi = ptr (aligned)
 121 //      %xmm0 = rotated pattern
 122
 123 LAligned:
 124         movl    %edx,%ecx               // copy length remaining
 125         andl    $63,%edx                // mask down to residual length (0..63)
 126         andl    $-64,%ecx               // %ecx <- #bytes we will zero in by-64 loop
 127         jz      LNoMoreChunks           // no 64-byte chunks
 128         addl    %ecx,%edi               // increment ptr by length to move
 129         cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
 130         jge     LVeryLong               // yes
 131         negl    %ecx                    // negate length to move
 132         jmp     1f
 133
 134 // Loop over 64-byte chunks, storing into cache.
 135
 136         .align  4,0x90                  // keep inner loops 16-byte aligned
 137 1:
 138         movdqa  %xmm0,(%edi,%ecx)
 139         movdqa  %xmm0,16(%edi,%ecx)
 140         movdqa  %xmm0,32(%edi,%ecx)
 141         movdqa  %xmm0,48(%edi,%ecx)
 142         addl    $64,%ecx
 143         jne     1b
 144
 145         jmp     LNoMoreChunks
 146
 147 // Very long operands: use non-temporal stores to bypass cache.
 148
 149 LVeryLong:
 150         negl    %ecx                    // negate length to move
 151         jmp     1f
 152
 153         .align  4,0x90                  // keep inner loops 16-byte aligned
 154 1:
 155         movntdq %xmm0,(%edi,%ecx)
 156         movntdq %xmm0,16(%edi,%ecx)
 157         movntdq %xmm0,32(%edi,%ecx)
 158         movntdq %xmm0,48(%edi,%ecx)
 159         addl    $64,%ecx
 160         jne     1b
 161
 162         sfence                          // required by non-temporal stores
 163         jmp     LNoMoreChunks
 164
 165 // Handle leftovers: loop by 16.
 166 //      %edx = length remaining (<64)
 167 //      %edi = ptr (aligned)
 168 //      %xmm0 = rotated pattern
 169
 170 LLoopBy16:
 171         movdqa  %xmm0,(%edi)            // pack in 16 more bytes
 172         subl    $16,%edx                // decrement count
 173         addl    $16,%edi                // increment ptr
 174 LNoMoreChunks:
 175         cmpl    $16,%edx                // more to go?
 176         jge     LLoopBy16               // yes
 177         jmp     LLessThan16             // handle up to 15 remaining bytes
 178
 179         COMMPAGE_DESCRIPTOR(memset_pattern_sse3,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)