osfmk/i386/commpage/memset_pattern_sse2_64.s

   1 /*
   2  * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <machine/cpu_capabilities.h>
  30 #include <machine/commpage.h>
  31
  32 /* The common path for nonzero memset and the memset_pattern routines,
  33  * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
  34  * This is the 64-bit version.  It is used by the following functions:
  35  *
  36  *      void *memset(void *b, int c, size_t len);                   // when c!=0
  37  *      void memset_pattern4(void *b, const void *c4, size_t len);
  38  *      void memset_pattern8(void *b, const void *c8, size_t len);
  39  *      void memset_pattern16(void *b, const void *c16, size_t len);
  40  *
  41  * Note bzero() and memset() of 0 are handled separately.
  42  */
  43
  44 #define kShort          63
  45 #define kVeryLong       (1024*1024)
  46
  47 // Initial entry from Libc with parameters passed in registers.  Although we
  48 // correctly handle misaligned ptrs and short operands, they are inefficient.
  49 // Therefore our caller should filter out short operands and exploit local
  50 // knowledge (ie, original pattern length) to align the ptr if possible.
  51 // When called, we expect:
  52 //      %rdi = ptr to memory to set (not necessarily aligned)
  53 //      %rdx = length (may be short or even 0)
  54 //      %xmm0 = the pattern to store
  55 // Return conditions:
  56 //      %rax, %rdi, %rsi, %rcx, and %rdx all trashed
  57 //      we preserve %r8, %r9, %r10, and %r11
  58
  59 COMMPAGE_FUNCTION_START(memset_pattern_sse2_64, 64, 5)
  60         cmpq    $(kShort),%rdx          // long enough to bother aligning? (unsigned compare)
  61         ja      LNotShort               // yes
  62         jmp     LShort                  // no
  63
  64 // Here for short operands or the end of long ones.
  65 //      %rdx = length (<= kShort, so the 32-bit %edx holds all of it)
  66 //      %rdi = ptr (may not be aligned)
  67 //      %xmm0 = pattern
  68
  69 LUnalignedStore16:
  70         movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
  71         subl    $16,%edx
  72         addq    $16,%rdi
  73 LShort:
  74         cmpl    $16,%edx                // room for another vector?
  75         jae     LUnalignedStore16       // yes (count is unsigned, 0..63 here)
  76 LLessThan16:                            // here at end of fill with < 16 bytes remaining
  77         test    $8,%dl                  // 8-byte store required?
  78         jz      2f                      // no
  79         movq    %xmm0,(%rdi)            // pack in 8 low bytes
  80         psrldq  $8,%xmm0                // then shift vector down 8 bytes
  81         addq    $8,%rdi
  82 2:
  83         test    $4,%dl                  // 4-byte store required?
  84         jz      3f                      // no
  85         movd    %xmm0,(%rdi)            // pack in 4 low bytes
  86         psrldq  $4,%xmm0                // then shift vector down 4 bytes
  87         addq    $4,%rdi
  88 3:
  89         andl    $3,%edx                 // more to go? (0..3 trailing bytes)
  90         jz      5f                      // no
  91         movd    %xmm0,%eax              // move remainders out into %eax
  92 4:                                      // loop on up to three bytes
  93         movb    %al,(%rdi)              // pack in next byte
  94         shrl    $8,%eax                 // shift next byte into position
  95         incq    %rdi
  96         dec     %edx
  97         jnz     4b
  98 5:      ret
  99
 100 // Long enough to justify aligning ptr.  Note that we have to rotate the
 101 // pattern to account for any alignment.  We do this by doing two unaligned
 102 // stores, and then an aligned load from the middle of the two stores.
 103 // This will stall on store forwarding alignment mismatch, and the unaligned
 104 // stores can be pretty slow too, but the alternatives aren't any better.
 105 // Fortunately, in most cases our caller has already aligned the ptr.
 106 //      %rdx = length (> kShort)
 107 //      %rdi = ptr (may not be aligned)
 108 //      %xmm0 = pattern
 109
 110 LNotShort:
 111         movl    %edi,%ecx               // copy low bits of dest ptr
 112         negl    %ecx
 113         andl    $15,%ecx                // mask down to #bytes to 16-byte align
 114         jz      LAligned                // skip if already aligned
 115         movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
 116         movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
 117         addq    %rcx,%rdi               // now point to the aligned chunk
 118         subq    %rcx,%rdx               // adjust remaining count
 119         movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
 120         addq    $16,%rdi                // skip past the aligned chunk
 121         subq    $16,%rdx                // length was >= 64, so at least 33 bytes remain
 122
 123 // Set up for 64-byte loops.
 124 //      %rdx = length remaining
 125 //      %rdi = ptr (aligned)
 126 //      %xmm0 = rotated pattern
 127
 128 LAligned:
 129         movq    %rdx,%rcx               // copy length remaining
 130         andl    $63,%edx                // mask down to residual length (0..63)
 131         andq    $-64,%rcx               // %rcx <- #bytes we will set in by-64 loop
 132         jz      LNoMoreChunks           // no 64-byte chunks
 133         addq    %rcx,%rdi               // point past the chunks; loops index back from here
 134         cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
 135         jae     LVeryLong               // yes (unsigned compare: length is a size_t)
 136         negq    %rcx                    // negative offset, counts up to zero
 137         jmp     1f
 138
 139 // Loop over 64-byte chunks, storing into cache.
 140
 141         .align  4,0x90                  // keep inner loops 16-byte aligned
 142 1:
 143         movdqa  %xmm0,(%rdi,%rcx)
 144         movdqa  %xmm0,16(%rdi,%rcx)
 145         movdqa  %xmm0,32(%rdi,%rcx)
 146         movdqa  %xmm0,48(%rdi,%rcx)
 147         addq    $64,%rcx                // advance offset toward zero
 148         jne     1b                      // until all chunks are stored
 149
 150         jmp     LNoMoreChunks
 151
 152 // Very long operands: use non-temporal stores to bypass cache.
 153
 154 LVeryLong:
 155         negq    %rcx                    // negative offset, counts up to zero
 156         jmp     1f
 157
 158         .align  4,0x90                  // keep inner loops 16-byte aligned
 159 1:
 160         movntdq %xmm0,(%rdi,%rcx)
 161         movntdq %xmm0,16(%rdi,%rcx)
 162         movntdq %xmm0,32(%rdi,%rcx)
 163         movntdq %xmm0,48(%rdi,%rcx)
 164         addq    $64,%rcx                // advance offset toward zero
 165         jne     1b                      // until all chunks are stored
 166
 167         sfence                          // required by non-temporal stores
 168         jmp     LNoMoreChunks
 169
 170 // Handle leftovers: loop by 16.
 171 //      %edx = length remaining (<64)
 172 //      %rdi = ptr (aligned)
 173 //      %xmm0 = rotated pattern
 174
 175 LLoopBy16:
 176         movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
 177         subl    $16,%edx                // decrement count
 178         addq    $16,%rdi                // increment ptr
 179 LNoMoreChunks:
 180         cmpl    $16,%edx                // more to go?
 181         jae     LLoopBy16               // yes (count is unsigned, 0..63 here)
 182         jmp     LLessThan16             // handle up to 15 remaining bytes
 183
 184 COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)