osfmk/i386/commpage/memset_pattern_sse2.s

   1 /*
   2  * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <machine/cpu_capabilities.h>
  30 #include <machine/commpage.h>
  31
  32 /* The common path for nonzero memset and the memset_pattern routines,
  33  * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
  34  * This is used by the following functions:
  35  *
  36  *      void *memset(void *b, int c, size_t len);                   // when c!=0
  37  *      void memset_pattern4(void *b, const void *c4, size_t len);
  38  *      void memset_pattern8(void *b, const void *c8, size_t len);
  39  *      void memset_pattern16(void *b, const void *c16, size_t len);
  40  *
  41  * Note bzero() and memset() of 0 are handled separately.
  42  */
  43
  44 #define kShort          63
  45 #define kVeryLong       (1024*1024)
  46
  47 // Initial entry from Libc with parameters passed in registers.  Although we
  48 // correctly handle misaligned ptrs and short operands, they are inefficient.
  49 // Therefore our caller should filter out short operands and exploit local
  50 // knowledge (ie, original pattern length) to align the ptr if possible.
  51 // When called, we expect:
  52 //      %edi = ptr to memory to set (not necessarily aligned)
  53 //      %edx = length (may be short or even 0)
  54 //      %xmm0 = the pattern to store
  55 // Return conditions:
  56 //      %eax, %edi, %esi, %ecx, and %edx all trashed
  57
  58         .text
  59         .align  5, 0x90
  60 Lmemset_pattern_sse2:
  61         cmpl    $(kShort),%edx          // long enough to bother aligning?
  62         ja      LNotShort               // yes
  63         jmp     LShort                  // no
  64
  65 // Here for short operands or the end of long ones.
  66 //      %edx = length
  67 //      %edi = ptr (may not be not aligned)
  68 //      %xmm0 = pattern
  69
  70 LUnalignedStore16:
  71         movdqu  %xmm0,(%edi)            // stuff in another 16 bytes
  72         subl    $16,%edx
  73         addl    $16,%edi
  74 LShort:
  75         cmpl    $16,%edx                // room for another vector?
  76         jge     LUnalignedStore16       // yes
  77 LLessThan16:                            // here at end of copy with < 16 bytes remaining
  78         test    $8,%dl                  // 8-byte store required?
  79         jz      2f                      // no
  80         movq    %xmm0,(%edi)            // pack in 8 low bytes
  81         psrldq  $8,%xmm0                // then shift vector down 8 bytes
  82         addl    $8,%edi
  83 2:
  84         test    $4,%dl                  // 4-byte store required?
  85         jz      3f                      // no
  86         movd    %xmm0,(%edi)            // pack in 4 low bytes
  87         psrldq  $4,%xmm0                // then shift vector down 4 bytes
  88         addl    $4,%edi
  89 3:
  90         andl    $3,%edx                 // more to go?
  91         jz      5f                      // no
  92         movd    %xmm0,%eax              // move remainders out into %eax
  93 4:                                      // loop on up to three bytes
  94         movb    %al,(%edi)              // pack in next byte
  95         shrl    $8,%eax                 // shift next byte into position
  96         inc     %edi
  97         dec     %edx
  98         jnz     4b
  99 5:      ret
 100
 101 // Long enough to justify aligning ptr.  Note that we have to rotate the
 102 // pattern to account for any alignment.  We do this by doing two unaligned
 103 // stores, and then an aligned load from the middle of the two stores.
 104 // This will stall on store forwarding alignment mismatch, and the unaligned
 105 // stores can be pretty slow too, but the alternatives aren't any better.
 106 // Fortunately, in most cases our caller has already aligned the ptr.
 107 //      %edx = length (> kShort)
 108 //      %edi = ptr (may not be aligned)
 109 //      %xmm0 = pattern
 110
 111 LNotShort:
 112         movl    %edi,%ecx               // copy dest ptr
 113         negl    %ecx
 114         andl    $15,%ecx                // mask down to #bytes to 16-byte align
 115         jz      LAligned                // skip if already aligned
 116         movdqu  %xmm0,(%edi)            // store 16 unaligned bytes
 117         movdqu  %xmm0,16(%edi)          // and 16 more, to be sure we have an aligned chunk
 118         addl    %ecx,%edi               // now point to the aligned chunk
 119         subl    %ecx,%edx               // adjust remaining count
 120         movdqa  (%edi),%xmm0            // get the rotated pattern (probably stalling)
 121         addl    $16,%edi                // skip past the aligned chunk
 122         subl    $16,%edx
 123
 124 // Set up for 64-byte loops.
 125 //      %edx = length remaining
 126 //      %edi = ptr (aligned)
 127 //      %xmm0 = rotated pattern
 128
 129 LAligned:
 130         movl    %edx,%ecx               // copy length remaining
 131         andl    $63,%edx                // mask down to residual length (0..63)
 132         andl    $-64,%ecx               // %ecx <- #bytes we will zero in by-64 loop
 133         jz      LNoMoreChunks           // no 64-byte chunks
 134         addl    %ecx,%edi               // increment ptr by length to move
 135         cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
 136         jge     LVeryLong               // yes
 137         negl    %ecx                    // negate length to move
 138         jmp     1f
 139
 140 // Loop over 64-byte chunks, storing into cache.
 141
 142         .align  4,0x90                  // keep inner loops 16-byte aligned
 143 1:
 144         movdqa  %xmm0,(%edi,%ecx)
 145         movdqa  %xmm0,16(%edi,%ecx)
 146         movdqa  %xmm0,32(%edi,%ecx)
 147         movdqa  %xmm0,48(%edi,%ecx)
 148         addl    $64,%ecx
 149         jne     1b
 150
 151         jmp     LNoMoreChunks
 152
 153 // Very long operands: use non-temporal stores to bypass cache.
 154
 155 LVeryLong:
 156         negl    %ecx                    // negate length to move
 157         jmp     1f
 158
 159         .align  4,0x90                  // keep inner loops 16-byte aligned
 160 1:
 161         movntdq %xmm0,(%edi,%ecx)
 162         movntdq %xmm0,16(%edi,%ecx)
 163         movntdq %xmm0,32(%edi,%ecx)
 164         movntdq %xmm0,48(%edi,%ecx)
 165         addl    $64,%ecx
 166         jne     1b
 167
 168         sfence                          // required by non-temporal stores
 169         jmp     LNoMoreChunks
 170
 171 // Handle leftovers: loop by 16.
 172 //      %edx = length remaining (<64)
 173 //      %edi = ptr (aligned)
 174 //      %xmm0 = rotated pattern
 175
 176 LLoopBy16:
 177         movdqa  %xmm0,(%edi)            // pack in 16 more bytes
 178         subl    $16,%edx                // decrement count
 179         addl    $16,%edi                // increment ptr
 180 LNoMoreChunks:
 181         cmpl    $16,%edx                // more to go?
 182         jge     LLoopBy16               // yes
 183         jmp     LLessThan16             // handle up to 15 remaining bytes
 184
 185         COMMPAGE_DESCRIPTOR(memset_pattern_sse2,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)