/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include "platfunc.h"

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines. This is the 64-bit version.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %rax before returning.
 */
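
// For orientation, a hedged C sketch of the strategy implemented below.
// Illustrative only, not the reference code: bzero_sketch is a made-up name,
// the <emmintrin.h> intrinsics stand in for the hand-written movdqa/movntdq/
// sfence instructions, kShort and kVeryLong mirror the #defines that follow,
// and the real tail reuses the doubleword short path instead of a byte loop.
//
//      #include <emmintrin.h>
//      #include <stddef.h>
//      #include <stdint.h>
//
//      void bzero_sketch(void *b, size_t len) {
//              unsigned char *p = b;
//              if (len > kShort) {
//                      size_t fix = (size_t)(-(uintptr_t)p) & 15;  // bytes to 16-byte boundary
//                      for (len -= fix; fix; --fix)
//                              *p++ = 0;
//                      __m128i zero = _mm_setzero_si128();
//                      size_t chunks = len & ~(size_t)63;  // whole 64-byte chunks
//                      len &= 63;                          // residual 0..63 bytes
//                      int nt = (chunks >= kVeryLong);     // huge fills bypass the cache
//                      for (size_t i = 0; i < chunks; i += 16) {
//                              if (nt)
//                                      _mm_stream_si128((__m128i *)(p + i), zero);
//                              else
//                                      _mm_store_si128((__m128i *)(p + i), zero);
//                      }
//                      p += chunks;
//                      if (nt)
//                              _mm_sfence();               // flush the non-temporal stores
//              }
//              while (len--)                               // short operands and the residual tail
//                      *p++ = 0;
//      }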

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (1024*1024)
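// (Why 80: aligning the pointer can consume up to 15 bytes, and the chunk
// loop below always runs at least once, so it needs a full 64-byte chunk
// left over; 15 + 64 = 79, so any length greater than 80 is guaranteed to
// leave at least one chunk for the aligned SSE loop.)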

// void bzero(void *b, size_t len);

PLATFUNC_FUNCTION_START_GENERIC(bzero, sse2, 64, 5)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
        movq    %rdi,%r11               // save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero

Lshort:
        cmpl    $16,%esi                // long enough to doubleword align?
        jge     3f                      // yes
        test    %esi,%esi               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
3:
        testl   $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %esi,%ecx               // copy length
        shrl    $2,%esi                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // zero an aligned doubleword
        addq    $4,%rdi
        decl    %esi
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %ecx
        jnz     5b
6:
        movq    %r11,%rax               // set return value in case this was a call of memset()
        popq    %rbp
        ret
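
// The short path above, restated as a hedged C sketch (illustrative only;
// bzero_short_sketch is not a real symbol): operands of 16 bytes or more are
// byte-filled up to a 4-byte boundary, bulk-stored as doublewords, and
// finished with the 0..3 leftover bytes; anything shorter is byte-filled.
//
//      void bzero_short_sketch(unsigned char *p, unsigned len) {
//              if (len >= 16) {
//                      while ((uintptr_t)p & 3) {      // align to a doubleword
//                              *p++ = 0;
//                              --len;
//                      }
//                      unsigned tail = len & 3;        // 0..3 bytes left at the end
//                      for (len >>= 2; len; --len, p += 4)
//                              *(uint32_t *)p = 0;     // aligned doubleword store
//                      len = tail;
//              }
//              while (len--)
//                      *p++ = 0;
//      }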


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero

LNotShort:
        movl    %edi,%ecx               // get #bytes to 16-byte align ptr
        negl    %ecx
        andl    $15,%ecx
        jz      LDestAligned            // already aligned
        subq    %rcx,%rsi               // decrement length
0:                                      // loop storing bytes to align the ptr
        movb    %al,(%rdi)              // pack in a byte
        incq    %rdi
        decl    %ecx
        jnz     0b
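
// (The negl/andl pair computes (-ptr) & 15, the distance to the next 16-byte
// boundary: e.g. a pointer ending in 0x9 gives (-9) & 15 = 7, so 7 bytes are
// zeroed by hand before the SSE loop takes over; a pointer already on a
// boundary gives 0 and skips straight to LDestAligned.)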

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // mask down to residual length (0..63)
        andq    $-64,%rcx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addq    %rcx,%rdi               // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jae     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f
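
// (Both chunk loops below use the same indexing trick: %rdi has already been
// advanced past the chunked region, and %rcx is negated so it counts up from
// -chunks to 0 while stores address (%rdi,%rcx). The single addq then both
// advances the index and sets the flags tested by jne, avoiding a separate
// cmp in the inner loop. A hedged C analog, where end and store64 are
// illustrative names:
//
//      for (long i = -(long)chunks; i != 0; i += 64)
//              store64(end + i);       // end == original ptr + chunks
// )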

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     Lshort
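
// (Each iteration writes exactly one 64-byte cache line with four aligned
// 16-byte movdqa stores. These stores go through the cache hierarchy, which
// is preferable for moderate sizes where the freshly zeroed buffer is likely
// to be touched again soon.)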

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     Lshort
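
// (movntdq stores are weakly ordered and may linger in write-combining
// buffers; the sfence forces them to be globally visible before bzero
// returns, so subsequent ordinary loads and stores observe the zeroed
// memory. In intrinsic terms this pairs _mm_stream_si128 with _mm_sfence,
// as in the sketch at the top of the file.)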

PLATFUNC_DESCRIPTOR(bzero,sse2,kHasSSE2,kHasSSE4_2)
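
// (Platform-function dispatch metadata. Reading the descriptor as
// (name, variant, required flags, excluded flags) -- an assumption from the
// argument pattern -- this sse2 variant is chosen when the CPU has SSE2 but
// not SSE4.2, where a separately tuned variant would presumably win.)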