/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually use SSE4.2, but rather use it to identify Nehalem.
 * This is the 64-bit version.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %rax before returning.
 */
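
/*
 * In C terms the single entry point below serves both callers (a sketch of
 * the contract, not the exact libc prototypes):
 *
 *      void  bzero(void *b, size_t len);          // return value ignored
 *      void *memset(void *b, int c, size_t len);  // c is always 0 here; must return b
 *
 * Because memset() must return its first argument, %rdi is saved in %r11 on
 * entry and copied into %rax on every exit path.
 */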

#define kShort          80              // too short to bother with SSE (must be >=80)
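/*
 * Why the floor matters (a sketch of the reasoning, inferred from the code
 * below): the SSE path may spend up to 15 bytes aligning the pointer to a
 * 16-byte boundary and then assumes at least one full 64-byte chunk remains,
 * so operands at or below the threshold go to the simple scalar path instead.
 */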


        .text
        .code64
        .align  5, 0x90
Lbzero_sse42_64:                        // void bzero(void *b, size_t len);
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
        movq    %rdi,%r11               // save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero

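// The short path works in three stages: zero single bytes until the pointer
// is 4-byte aligned, store aligned doublewords, then finish the last 0..3
// bytes. Operands shorter than 12 bytes are not worth aligning and are done
// entirely bytewise.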
Lshort:
        cmpl    $12,%esi                // long enough to word align?
        jge     3f                      // yes
        test    %esi,%esi               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
3:
        testl   $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %esi,%ecx               // copy length
        shrl    $2,%esi                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // zero an aligned doubleword
        addq    $4,%rdi
        decl    %esi
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %ecx
        jnz     5b
6:
        movq    %r11,%rax               // set return value in case this was a call of memset()
        popq    %rbp
        ret


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero

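// Alignment happens in two phases: single bytes until the pointer is 4-byte
// aligned (at most 3 of them), then doublewords until it is 16-byte aligned
// (at most 12 more), so no more than 15 bytes are consumed before the
// 64-byte loop.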
LNotShort:
        testl   $3,%edi                 // 4-byte aligned?
        jz      2f                      // yes
        movb    %al,(%rdi)              // zero another byte
        incq    %rdi
        decq    %rsi
        jmp     LNotShort
1:                                      // zero doublewords until 16-byte aligned
        movl    %eax,(%rdi)
        addq    $4,%rdi
        subq    $4,%rsi
2:
        testl   $15,%edi                // 16-byte aligned?
        jnz     1b                      // no

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // mask down to residual length (0..63)
        andq    $-64,%rcx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addq    %rcx,%rdi               // increment ptr by length to move
        negq    %rcx                    // negate length to move
        jmp     1f

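// The stores below address memory as (%rdi,%rcx): %rdi has already been
// advanced to the end of the 64-byte-aligned region and %rcx holds the
// negative byte count, so a single ADDQ/JNE pair both steps the index and
// tests for the end of the region.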
// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

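// Finish the 0..63 residual bytes (left in %esi) with the scalar code at
// Lshort, which also sets up the return value.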
        jmp     Lshort


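// Register this variant in the commpage, keyed on SSE4.2 support. As noted
// in the header comment, SSE4.2 is only used as a CPU-family marker for
// Nehalem; the instructions actually used (PXOR, MOVDQA) are plain SSE2.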
        COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0)