/* osfmk/i386/commpage/bzero_sse3.s (xnu-792.13.8) */
/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE3
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %eax before returning.
 */
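
// A rough C-level picture of the contract this entry point satisfies (a
// sketch only; it assumes the libc stubs for both routines can jump here):
//
//      void  bzero(void *b, size_t len);           // no return value
//      void *memset(void *b, int c, size_t len);   // must return b
//
// Since memset(p,0,n) can arrive here as well, the epilogue reloads the
// original pointer into %eax so a memset-style caller still gets it back.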

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (1024*1024)
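
// Rough shape of the length-based dispatch below (a sketch, not literal code):
//
//      if (len <= kShort)              store bytes, then aligned 4-byte words
//      else if (chunk < kVeryLong)     cached 16-byte stores (movdqa), 64 bytes/loop
//      else                            non-temporal stores (movntdq), then sfence
//
// where "chunk" is the 64-byte-aligned portion left after the pointer has
// been advanced to a 16-byte boundary.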


        .text
        .align  5, 0x90
Lbzero_sse3:                            // void bzero(void *b, size_t len);
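// On entry (i386 cdecl): 4(%esp) = b, 8(%esp) = len.  After the frame is set
// up below, those arguments sit at 8(%ebp) and 12(%ebp) respectively.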
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

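// In rough C terms this short path does (a sketch, not literal code):
//
//      if (len < 16) {                         // not worth aligning
//          while (len--) *p++ = 0;
//      } else {
//          while ((uintptr_t)p & 3) { *p++ = 0; len--; }        // align to 4
//          for (i = len >> 2; i; i--) { *(uint32_t *)p = 0; p += 4; }
//          for (len &= 3; len; len--) *p++ = 0;                 // 0..3 tail bytes
//      }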
Lshort:
        cmpl    $16,%edx                // long enough to bother doubleword aligning?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call of memset()
        popl    %edi
        popl    %ebp
        ret


// We will be using SSE, so align ptr.

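// The number of bytes needed to reach the next 16-byte boundary is
// (-(uintptr_t)ptr) & 15, which is what the negl/andl pair below computes.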
LNotShort:
        movl    %edi,%ecx
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
0:                                      // loop storing bytes to align the ptr
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     0b

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

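// The inner loops below advance %edi past the whole 64-byte-aligned region up
// front and run %ecx from minus that length up toward zero, so a single addl
// both steps the index and sets the flags that terminate the loop.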
LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
        jae     LVeryLong               // yes
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
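// (movdqa requires a 16-byte-aligned destination, which the alignment loop
// above guarantees.)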

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort

// Very long operands: use non-temporal stores to bypass cache.

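// Filling a buffer of kVeryLong (1 MB) or more through the cache would mostly
// just evict useful data, so movntdq writes around it; the sfence at the end
// makes these weakly-ordered stores globally visible before we return.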
LVeryLong:
        negl    %ecx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%edi,%ecx)
        movntdq %xmm0,16(%edi,%ecx)
        movntdq %xmm0,32(%edi,%ecx)
        movntdq %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     Lshort


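// Register this routine with the commpage: the kernel copies it to
// _COMM_PAGE_BZERO on processors whose capability bits include kHasSSE2.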
        COMMPAGE_DESCRIPTOR(bzero_sse3,_COMM_PAGE_BZERO,kHasSSE2,0)