/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE3
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 */
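
/*
 * For orientation, a rough C sketch of the strategy below (the helper
 * names are illustrative only, not part of this source):
 *
 *	void bzero_sketch(void *b, size_t len) {
 *	    if (len <= kShort)                  // Lshort: bytes + doublewords
 *	        return store_short(b, len);
 *	    len -= align16(&b);                 // LNotShort: byte stores
 *	    if ((len & -64) >= kVeryLong)
 *	        nt_store_64B_chunks(&b, &len);  // LVeryLong: movntdq + sfence
 *	    else
 *	        store_64B_chunks(&b, &len);     // movdqa, stays in cache
 *	    store_short(b, len & 63);           // residual 0..63 bytes
 *	}
 */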

#define	kShort		80		// too short to bother with SSE (must be >=80)
#define	kVeryLong	(1024*1024)
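
/*
 * Note on kShort: the SSE path may spend up to 15 one-byte stores
 * aligning the ptr, and the 64-byte chunk loop below assumes a nonzero
 * chunk count.  15 + 64 = 79, so any length above 80 is guaranteed at
 * least one full 64-byte chunk after alignment.
 */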

	.text
	.align	5, 0x90
Lbzero_sse3:				// void bzero(void *b, size_t len);
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%edi			// preserve callee-saved %edi
	movl	8(%ebp),%edi		// get ptr
	movl	12(%ebp),%edx		// get length

	xorl	%eax,%eax		// set fill data to 0
	cmpl	$(kShort),%edx		// long enough for SSE?
	jg	LNotShort		// yes

// Here for short operands or the end of long ones.
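//      %edx = length
//      %edi = ptr
//      %eax = zero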

Lshort:
	cmpl	$16,%edx		// long enough to word align?
	jge	3f			// yes
	test	%edx,%edx		// length==0?
	jz	6f			// yes, done
1:					// loop storing single bytes
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
	jnz	1b
	jmp	6f
2:					// store bytes until ptr is doubleword aligned
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
3:
	test	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%edx,%ecx		// copy length
	shrl	$2,%edx			// #doublewords to store
4:					// loop storing doublewords
	movl	%eax,(%edi)		// zero an aligned doubleword
	addl	$4,%edi
	dec	%edx
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:					// loop storing trailing bytes
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%ecx
	jnz	5b
6:
	movl	8(%ebp),%eax		// get return value in case this was a call of memset()
	popl	%edi
	popl	%ebp
	ret

// We will be using SSE, so align ptr.

LNotShort:
	movl	%edi,%ecx		// copy ptr
	negl	%ecx			// (-ptr) & 15 = #bytes to next 16-byte boundary
	andl	$15,%ecx		// mask down to #bytes to 16-byte align
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length
0:					// loop storing bytes to align the ptr
	movb	%al,(%edi)		// pack in a byte
	inc	%edi
	dec	%ecx
	jnz	0b

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
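//      %edx = length
//      %edi = ptr (16-byte aligned)
//      %eax = zero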

LDestAligned:
	movl	%edx,%ecx		// copy length
	andl	$63,%edx		// mask down to residual length (0..63)
	andl	$-64,%ecx		// get #bytes we will zero in this loop
	pxor	%xmm0,%xmm0		// zero an SSE register
	addl	%ecx,%edi		// increment ptr by length to move
	cmpl	$(kVeryLong),%ecx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes
	negl	%ecx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.
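// %edi was pre-advanced past the region, so the negative index in %ecx
// climbs toward zero and a single jne both tests and continues the loop.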

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%edi,%ecx)
	movdqa	%xmm0,16(%edi,%ecx)
	movdqa	%xmm0,32(%edi,%ecx)
	movdqa	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx		// advance the negative index
	jne	1b			// until it reaches zero

	jmp	Lshort			// store residual bytes (0..63)

// Very long operands: use non-temporal stores to bypass cache.
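// At >= 1MB the buffer cannot stay cache-resident anyway, so movntdq
// writes combine straight to memory instead of evicting useful lines.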

LVeryLong:
	negl	%ecx			// negate length to move
	jmp	1f

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq	%xmm0,(%edi,%ecx)
	movntdq	%xmm0,16(%edi,%ecx)
	movntdq	%xmm0,32(%edi,%ecx)
	movntdq	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx		// advance the negative index
	jne	1b			// until it reaches zero

	sfence				// required by non-temporal stores
	jmp	Lshort			// store residual bytes (0..63)
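
// Note: movntdq stores are weakly ordered with respect to other stores,
// so the sfence above is needed to make them globally visible before
// bzero returns.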

	COMMPAGE_DESCRIPTOR(bzero_sse3,_COMM_PAGE_BZERO,kHasSSE2,0)
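
/*
 * The descriptor registers this variant for the commpage: the kernel
 * installs it at _COMM_PAGE_BZERO on processors whose capability bits
 * include kHasSSE2 (with no forbidden bits, per the final 0).  Although
 * tuned for SSE3-class processors, the routine only executes SSE2
 * instructions (pxor/movdqa/movntdq), which matches the kHasSSE2 gate.
 */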