osfmk/i386/commpage/bzero_sse2.s (apple/xnu, xnu-1228.9.59)
/*
 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %eax before returning.
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (1024*1024)
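
// As a rough C sketch of the control flow below (illustrative only, not
// part of the build; the comments name the labels that follow):
//
//      void bzero(void *b, size_t len) {
//          if (len <= kShort)                      // Lshort: scalar stores
//              { store bytes, then doublewords, then trailing bytes; return; }
//          byte-store until b is 16-byte aligned;  // LNotShort
//          if (64-byte chunk length >= kVeryLong)
//              zero 64 bytes/iteration with movntdq;   // LVeryLong, bypasses cache
//          else
//              zero 64 bytes/iteration with movdqa;    // cached stores
//          finish the 0..63 residual bytes via Lshort;
//      }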


        .text
        .align  5, 0x90
Lbzero_sse2:                            // void bzero(void *b, size_t len);
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

Lshort:
        cmpl    $16,%edx                // long enough to doubleword align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call to memset()
        popl    %edi
        popl    %ebp
        ret
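
// Note on the short path: bytes are stored one at a time until the pointer
// is 4-byte aligned, then zero is stored a doubleword at a time, and the
// final 0..3 bytes are stored singly. Reloading the original pointer into
// %eax at label 6 is what lets this routine double as memset(p,0,n), which
// must return p.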


// We will be using SSE, so align ptr.

LNotShort:
        movl    %edi,%ecx
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
0:                                      // loop storing bytes to align the ptr
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     0b
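
// The byte count needed to reach alignment is ((-ptr) & 15): e.g. if the
// pointer ends in 0x9, its negation ends in 0x7, so seven byte stores
// carry it to the next 16-byte boundary; if the low four bits are already
// zero, the alignment loop is skipped entirely.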

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
        jae     LVeryLong               // yes
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort
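
// The loop above indexes with a negative count: %edi was advanced past the
// chunked region, and %ecx runs from -(chunk length) up to zero, so
// (%edi,%ecx) sweeps the whole chunk while a single addl/jne pair serves
// as both the pointer advance and the loop test. When %ecx hits zero,
// %edi is already positioned for Lshort to finish the residual bytes.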

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
        negl    %ecx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%edi,%ecx)
        movntdq %xmm0,16(%edi,%ecx)
        movntdq %xmm0,32(%edi,%ecx)
        movntdq %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     Lshort
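
// movntdq goes through the write-combining buffers and bypasses the cache
// hierarchy, so zeroing a megabyte or more does not evict useful data.
// Non-temporal stores are weakly ordered, hence the sfence: it guarantees
// they are globally visible before any later stores.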


COMMPAGE_DESCRIPTOR(bzero_sse2,_COMM_PAGE_BZERO,kHasSSE2,0)
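
// The descriptor above registers this variant with the commpage machinery:
// at boot the kernel selects an implementation whose required capability
// bits (here kHasSSE2) match the running CPU and copies it to the
// _COMM_PAGE_BZERO slot, where user processes call it directly.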