/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort	80			// too short to bother with SSE (must be >=80)
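
//
// Why kShort must be at least 80: the long-copy paths below first align the
// destination by copying up to 15 bytes one at a time.  With a length
// greater than kShort that leaves at least 81 - 15 = 66 bytes, so the
// "andl $-64" in LDestAligned (and LReverseDestAligned) is never zero and
// the 64-byte inner loops always execute at least once.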


// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse42, 32, 5)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%esi		// get source ptr
	movl	12(%ebp),%edi		// get dest ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	jbe	Lshort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//
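// (The ".align 5, 0x90" below pads to the next 32-byte boundary, 2^5 = 32,
// so the bcopy entry and its dispatch code above have to fit in 32 bytes.
// User code appears to reach memcpy/memmove through a fixed commpage
// address 32 bytes past the bcopy slot, which is why neither entry point
// can move.)
//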

	.align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%edi		// get dest ptr
	movl	12(%ebp),%esi		// get source ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LNotShort		// yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//	ecx = length (<= kShort)
//	esi = source ptr
//	edi = dest ptr
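//
// The split is simple: length/4 doublewords, then length&3 trailing bytes.
// A 10-byte copy, for instance, does two iterations of loop 2 below and two
// of loop 4.  The same code also finishes off the 0..63 residual bytes left
// when the SSE loops jump back to Lshort.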

Lshort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	3f
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	Lexit
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
Lexit:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	ecx = length (> kShort)
//	esi = source ptr
//	edi = dest ptr

LNotShort:
	movl	%edi,%edx		// copy destination
	negl	%edx
	andl	$15,%edx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%edx,%ecx		// decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b

// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//	ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//	esi = ptr to 1st source byte not to move (unaligned)
//	edi = ptr to 1st dest byte not to move (aligned)
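//
// Worked example: with 200 bytes left after aligning the destination,
// edx = 200 & -64 = 192 and ecx = 200 & 63 = 8; esi/edi are advanced by 192
// and edx is negated to -192, so the loops below index upward from -192 to 0
// (three 64-byte chunks) and then fall into Lshort for the last 8 bytes.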

LDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	addl	%edx,%esi		// point to 1st byte not copied
	addl	%edx,%edi
	negl	%edx			// now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop
	jmp	LAlignedLoop


// Forward loop for aligned operands.
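//
// Both loops use a negative index that counts up toward zero: esi/edi point
// just past the 64-byte chunks and edx runs from -(chunk bytes) to 0 in
// steps of 64, so the single "addl $64,%edx" both advances the index and
// sets the zero flag that ends the loop; no separate compare is needed.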

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
	movdqa	(%esi,%edx),%xmm0
	movdqa	16(%esi,%edx),%xmm1
	movdqa	32(%esi,%edx),%xmm2
	movdqa	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx
	jnz	LAlignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.
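//
// Only the loads differ from the aligned loop: the destination was 16-byte
// aligned above, so the stores can stay MOVDQA; only the source may be
// misaligned, and Nehalem handles the MOVDQU loads with little penalty.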

	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
	movdqu	(%esi,%edx),%xmm0
	movdqu	16(%esi,%edx),%xmm1
	movdqu	32(%esi,%edx),%xmm2
	movdqu	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx
	jnz	LUnalignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done


// Reverse moves.  They are only used with destructive overlap.
//	ecx = length
//	esi = source ptr
//	edi = dest ptr
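//
// The dispatch above treats (dest - source) as unsigned: when dest lies
// inside [source, source+length) the difference is below the length, so a
// forward copy would overwrite source bytes before they are read and we
// come here instead.  For example, source=0x1000, dest=0x1008, length=16
// gives a difference of 8 < 16.  When dest is below source the subtraction
// wraps to a huge unsigned value and the (always safe) forward path is used.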

LReverse:
	addl	%ecx,%esi		// point to end of strings
	addl	%ecx,%edi
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	ecx = length
//	esi = one byte past end of source
//	edi = one byte past end of dest

LReverseShort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	3f
1:
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret

// Handle a reverse move long enough to justify using SSE.
//	ecx = length
//	esi = one byte past end of source
//	edi = one byte past end of dest
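//
// "Aligning the destination" here means aligning its end pointer downward:
// edi & 15 is the number of bytes above the last 16-byte boundary, and those
// are copied one at a time first so that the MOVDQA stores in the loops
// below always land on 16-byte boundaries.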

LReverseNotShort:
	movl	%edi,%edx		// copy destination
	andl	$15,%edx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.
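//
// Below, edx becomes the number of bytes the inner loops will move (a
// multiple of 64) and esi/edi are pulled back by that amount to the low end
// of the chunked region, so the -16..-64 displacements walk down through
// each chunk while edx counts down to zero.  With 200 bytes left, for
// instance, edx = 192 and ecx = 8: the loop runs three times and
// LReverseShort finishes the last 8 bytes.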

LReverseDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for LReverseShort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	subl	%edx,%esi		// point to endpoint of copy
	subl	%edx,%edi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%esi,%edx),%xmm0
	movdqa	-32(%esi,%edx),%xmm1
	movdqa	-48(%esi,%edx),%xmm2
	movdqa	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
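//
// LDDQU was introduced with SSE3 to soften the cache-line-split penalty of
// unaligned loads, but on Nehalem a plain MOVDQU is reportedly just as fast,
// so the loop below simply uses MOVDQU for the loads.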

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%esi,%edx),%xmm0
	movdqu	-32(%esi,%edx),%xmm1
	movdqu	-48(%esi,%edx),%xmm2
	movdqu	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


COMMPAGE_DESCRIPTOR(bcopy_sse42,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
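
//
// The descriptor above ties this routine to the commpage bcopy slot and
// marks it as usable only when the CPU advertises SSE4.2 (kHasSSE4_2); at
// boot the kernel picks, from the bcopy variants it was built with, the best
// one the hardware supports and installs it at _COMM_PAGE_BCOPY, with
// memcpy/memmove entered 32 bytes later.  A minimal user-space sketch of a
// call through that slot (hypothetical, for illustration only; the supported
// interface is the libc bcopy/memcpy/memmove stubs, and the +32 offset is
// assumed from the layout note near Lmemcpy):
//
//	#include <machine/cpu_capabilities.h>
//	#include <stddef.h>
//	#include <stdint.h>
//
//	typedef void *(*memcpy_fn)(void *, const void *, size_t);
//
//	static void copy_via_commpage(void *dst, const void *src, size_t len)
//	{
//		/* memcpy entry assumed to sit 32 bytes past the bcopy slot */
//		memcpy_fn f = (memcpy_fn)(uintptr_t)(_COMM_PAGE_BCOPY + 32);
//		f(dst, src, len);	/* returns dst, ignored here */
//	}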