apple/xnu (xnu-1504.3.12): osfmk/i386/commpage/bcopy_sse42_64.s
/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)

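// A rough C sketch of the dispatch implemented below (illustrative only; the
// helper name is hypothetical and nothing here is assembled).  The single
// unsigned compare relies on wraparound: (dest - source) < length exactly when
// the destination starts inside [source, source + length), assuming the source
// buffer does not wrap the address space.
//
//      static void *copy_dispatch(void *dst, const void *src, size_t len)
//      {
//          if ((uintptr_t)dst - (uintptr_t)src < len) {
//              /* destructive overlap: copy backwards (LReverse) */
//          } else if (len <= kShort) {
//              /* short forward copy, plain loads/stores (LShort) */
//          } else {
//              /* 16-byte-align the destination, then 64-byte SSE chunks (LNotShort) */
//          }
//          return dst;                 /* memcpy/memmove return their dst argument */
//      }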

// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse42_64, 64, 5)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

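// The bcopy() entry above just rotates its (src, dst, len) argument order into
// the (dst, src, len) order the shared code expects and falls into the same
// forward/reverse dispatch.  Roughly, as an illustrative C equivalent:
//
//      void bcopy(const void *src, void *dst, size_t len)
//      {
//          memmove(dst, src, len);
//      }
//
// Note that only the memcpy/memmove entry below saves the destination in %r11;
// bcopy returns void, so the %rax loaded from %r11 in the shared epilogue is
// simply ignored by bcopy's callers.
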
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

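// As a rough C sketch of this path (illustrative only; d and s stand for the
// dest and source byte pointers, and the casts gloss over C alignment/aliasing
// rules that do not constrain the assembly):
//
//      size_t n = len >> 2;            // whole 32-bit words first
//      while (n--) { *(uint32_t *)d = *(const uint32_t *)s; d += 4; s += 4; }
//      len &= 3;                       // then the 0..3 leftover bytes
//      while (len--) *d++ = *s++;
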
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

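// The alignment count computed below is, in C terms (illustrative only; d and s
// are the dest and source byte pointers):
//
//      size_t head = (size_t)(-(uintptr_t)d) & 15;     // 0..15 bytes to the next 16-byte boundary
//      len -= head;
//      while (head--) *d++ = *s++;                     // byte-copy up to the boundary
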
LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

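// The negative-index trick below lets one add both advance the offset and set
// the flags that end the loop.  A rough C sketch (illustrative only):
//
//      size_t chunk = len & ~(size_t)63;       // bytes for the 64-byte inner loop
//      len &= 63;                              // residual, finished by LShort
//      s += chunk;  d += chunk;                // point just past the chunked region
//      for (ptrdiff_t i = -(ptrdiff_t)chunk; i != 0; i += 64)
//          memcpy(d + i, s + i, 64);           // four 16-byte SSE moves in the real loop
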
LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

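// Because this path is taken only when the destination starts inside the source
// buffer, copying from the high addresses downward never overwrites source
// bytes that have not been read yet.  A rough C sketch of the setup
// (illustrative only):
//
//      s += len;  d += len;            // both pointers now one byte past the end
//      if (len > kShort)  { /* descending 64-byte SSE loop (LReverseNotShort) */ }
//      else               { /* descending quadword/byte loop (LReverseShort)  */ }
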
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

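// Roughly, in C (illustrative only; the casts again gloss over C alignment
// rules that do not constrain the assembly):
//
//      size_t n = len >> 3;                    // whole 64-bit words, highest first
//      while (n--) { s -= 8; d -= 8; *(uint64_t *)d = *(const uint64_t *)s; }
//      len &= 7;                               // then 0..7 remaining bytes
//      while (len--) *--d = *--s;
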
LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

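// Here the destination is aligned from the top: the low four bits of the end
// pointer give the bytes sitting above the previous 16-byte boundary.  A rough
// C sketch (illustrative only):
//
//      size_t tail = (uintptr_t)d & 15;        // bytes above the previous 16-byte boundary
//      len -= tail;
//      while (tail--) *--d = *--s;             // after this, d is 16-byte aligned
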
LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)