/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort	80			// too short to bother with SSE (must be >=80)

// void bcopy(const void *src, void *dst, size_t len);

	.text
	.code64
	.align	5, 0x90
Lbcopy_sse42_64:			// void bcopy(const void *src, void *dst, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// xchange source and dest ptrs
	movq	%rax,%rdi
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	jbe	LShort			// no
	jmp	LNotShort

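// Note on the overlap test above: (dest - source) is compared to the length as an
// unsigned quantity.  If the destination starts below the source, the subtraction
// wraps to a huge unsigned value and the "jb" falls through, so a forward copy is
// used.  The reverse path is taken only when the destination starts inside the
// source buffer (0 <= dest - source < length), where a forward copy would
// overwrite source bytes before they are read.
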
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

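// (The fixed 32-byte offset presumably reflects the commpage layout, where the
// memcpy/memmove entry point is published at a fixed distance from the bcopy
// entry; the alignment directives must not be changed independently of that layout.)
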
	.align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rdi,%r11		// save return value here
	movq	%rdi,%rax
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LNotShort		// yes

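// A rough C-level sketch of the dispatch logic above (illustrative only; these
// names are not part of this file):
//
//	if ((size_t)(dst - src) < len)		// dst overlaps the tail of src
//		copy_backward(dst, src, len);	// which itself splits again on kShort
//	else if (len <= kShort)
//		copy_short_forward(dst, src, len);
//	else
//		copy_sse_forward(dst, src, len);
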
// Handle short forward copies.  As the most common case, this is the fall-through path.
//	rdx = length (<= kShort)
//	rsi = source ptr
//	rdi = dest ptr

LShort:
	movl	%edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
	jz	3f
2:					// loop copying doublewords
	movl	(%rsi),%eax
	addq	$4,%rsi
	movl	%eax,(%rdi)
	addq	$4,%rdi
	decl	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:					// loop copying bytes
	movb	(%rsi),%al
	incq	%rsi
	movb	%al,(%rdi)
	incq	%rdi
	decl	%edx
	jnz	4b
5:
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret

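// Roughly equivalent C for the short path (illustrative only, ignoring the
// return value):
//
//	while (len >= 4) { *(uint32_t *)dst = *(uint32_t *)src; src += 4; dst += 4; len -= 4; }
//	while (len > 0)  { *dst++ = *src++; len--; }
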
LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	rdx = length (> kShort)
//	rsi = source ptr
//	rdi = dest ptr

LNotShort:
	movl	%edi,%ecx		// copy low half of destination ptr
	negl	%ecx
	andl	$15,%ecx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length
1:					// loop copying 1..15 bytes
	movb	(%rsi),%al
	incq	%rsi
	movb	%al,(%rdi)
	incq	%rdi
	decl	%ecx
	jnz	1b

// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//	rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//	rsi = ptr to 1st source byte not to move (unaligned)
//	rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	addq	%rcx,%rsi		// point to 1st byte not copied
	addq	%rcx,%rdi
	negq	%rcx			// now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop
	jmp	LAlignedLoop

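// The loops below index with a negative offset: rsi/rdi point just past the
// 64-byte-chunked region and rcx holds the negated chunk length, so each
// iteration simply adds 64 to rcx and the loop ends when rcx reaches zero,
// with no separate compare.  Roughly (illustrative C):
//
//	for (off = -(chunks * 64); off != 0; off += 64)
//		copy 64 bytes from (src_end + off) to (dst_end + off);
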
// Forward loop for aligned operands.

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
	movdqa	(%rsi,%rcx),%xmm0
	movdqa	16(%rsi,%rcx),%xmm1
	movdqa	32(%rsi,%rcx),%xmm2
	movdqa	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	LAlignedLoop

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for unaligned operands.

	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
	movdqu	(%rsi,%rcx),%xmm0
	movdqu	16(%rsi,%rcx),%xmm1
	movdqu	32(%rsi,%rcx),%xmm2
	movdqu	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	LUnalignedLoop

	jmp	LShort			// copy remaining 0..63 bytes and done

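// In both forward loops the stores use MOVDQA, which requires (and here relies
// on) the 16-byte destination alignment established above; only the loads fall
// back to MOVDQU when the source is not similarly aligned.
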
// Reverse moves.  These are only used with destructive overlap.
//	rdx = length
//	rsi = source ptr
//	rdi = dest ptr

LReverse:
	addq	%rdx,%rsi		// point to end of strings
	addq	%rdx,%rdi
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

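// In the reverse paths both pointers are first advanced one byte past the end of
// their buffers and the copy then walks backwards, so overlapping bytes are read
// before they are overwritten.  Roughly (illustrative C, byte-wise):
//
//	src += len; dst += len;
//	while (len--) *--dst = *--src;
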
// Handle reverse short copies.
//	edx = length (<= kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

LReverseShort:
	movl	%edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords
	jz	3f
1:					// loop copying quadwords
	subq	$8,%rsi
	movq	(%rsi),%rax
	subq	$8,%rdi
	movq	%rax,(%rdi)
	decl	%ecx
	jnz	1b
3:
	andl	$7,%edx			// bytes?
	jz	5f
4:					// loop copying bytes
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%edx
	jnz	4b
5:
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret

// Handle a reverse move long enough to justify using SSE.
//	rdx = length (> kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

LReverseNotShort:
	movl	%edi,%ecx		// copy low half of destination ptr
	andl	$15,%ecx		// get #bytes to align destination
	jz	LReverseDestAligned	// already aligned
	subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%ecx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LReverseShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	subq	%rcx,%rsi		// point to endpoint of copy
	subq	%rcx,%rdi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%rsi,%rcx),%xmm0
	movdqa	-32(%rsi,%rcx),%xmm1
	movdqa	-48(%rsi,%rcx),%xmm2
	movdqa	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%rsi,%rcx),%xmm0
	movdqu	-32(%rsi,%rcx),%xmm1
	movdqu	-48(%rsi,%rcx),%xmm2
	movdqu	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

	COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
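
// The descriptor above registers this routine for the commpage bcopy slot at
// _COMM_PAGE_BCOPY; it is installed only when the CPU advertises the kHasSSE4_2
// capability bit (with no excluded bits), so Nehalem-class processors get this variant.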