/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include "platfunc.h"

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)

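/*
 * Rough C-level sketch of the strategy below (illustrative only; the helper
 * names are placeholders, not symbols defined in this file):
 *
 *      if ((size_t)(dst - src) < len)          // forward copy would overwrite src
 *              copy_backward(dst, src, len);   // see LReverse
 *      else if (len <= kShort)
 *              copy_forward_bytewise(dst, src, len);   // see LShort
 *      else
 *              copy_forward_sse(dst, src, len);        // 64 bytes per iteration
 */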

// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse42, 64, 5)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse42, 64, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse42, 64, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

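// The single unsigned compare above folds both overlap tests into one: when
// dst is below src, (dest - source) wraps around to a huge unsigned value that
// is never smaller than the length, so the copy proceeds forward (safe for
// that layout); when dst is above src, the difference is smaller than the
// length exactly when a forward copy would overwrite source bytes that have
// not been read yet.  Example: src=0x1000, dst=0x1010, len=0x20 gives
// (dest - source)=0x10 < 0x20, so the move is done in reverse.
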
// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

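// For example, len=13 copies three doublewords (12 bytes) in the first loop
// below, then one leftover byte in the second.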
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

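// The negl/andl pair below computes (-dst) & 15, i.e. the number of bytes from
// dst up to the next 16-byte boundary (0 if already aligned).  For example, a
// dst ending in ...9 yields 7 bytes of byte-at-a-time copying before the SSE
// loop takes over.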
LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

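/*
 * The inner loops below use a negative index that counts up toward zero, so a
 * single addq both advances the offset and terminates the loop.  Roughly, in C
 * terms (names are illustrative, not defined in this file):
 *
 *      for (offset = -chunk_bytes; offset != 0; offset += 64)
 *              copy 64 bytes from (src_end + offset) to (dst_end + offset);
 *
 * where src_end/dst_end already point just past the last chunk.
 */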
LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

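// Both pointers are first advanced to one byte past the end of their buffers,
// and the copy then walks downward so that overlapping source bytes are read
// before the destination overwrites them.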
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

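// Note the alignment count here is (rdi & 15), not (-rdi) & 15 as in the
// forward path: the destination pointer moves downward, so rdi & 15 is the
// number of trailing bytes to peel off before it lands on a 16-byte boundary.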
LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


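// The descriptors below register these sse42 variants with the platfunc
// dispatch mechanism (see platfunc.h); the kHasSSE4_2 capability bit from
// <machine/cpu_capabilities.h> gates their selection at runtime.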
PLATFUNC_DESCRIPTOR(bcopy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memcpy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memmove,sse42,kHasSSE4_2,0)