/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include "platfunc.h"			// PLATFUNC_FUNCTION_START / PLATFUNC_DESCRIPTOR macros
/*
 * The bcopy/memcpy loops, tuned for Nehalem.
 *
 * The following #defines are tightly coupled to the micro-architecture:
 */
#define kShort	80			// too short to bother with SSE (must be >=80)
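// Note: the SSE path is taken only for lengths > kShort.  Aligning the destination
// consumes at most 15 bytes, so at least 64 bytes remain and the 64-byte inner
// loops below always execute at least once.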
// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse42, 32, 5)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%esi		// get source ptr
	movl	12(%ebp),%edi		// get dest ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx		// copy dest ptr
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland		// reverse move required
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	jbe	Lshort			// no, use the short-copy loops
	jmp	LNotShort		// yes, use the SSE path
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);

PLATFUNC_FUNCTION_START(memcpy, sse42, 32, 0)	// void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse42, 32, 0)	// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%edi		// get dest ptr
	movl	12(%ebp),%esi		// get source ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx		// copy dest ptr
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland		// reverse move required
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LNotShort		// yes, use the SSE path
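// Note on the overlap test above: (dest - source) and the length are compared as
// unsigned values, so the single "jb" catches exactly the case where the
// destination starts inside the source buffer, the only case that forces a
// reverse (descending) copy.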
// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	3f			// no whole doublewords to move
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f			// no, done
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
5:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		//   ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
	movl	%edi,%edx		// copy destination
	negl	%edx
	andl	$15,%edx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%edx,%ecx		// decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b
// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	addl	%edx,%esi		// point to 1st byte not copied
	addl	%edx,%edi
	negl	%edx			// now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop		// no, use unaligned loads
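// Note: edx holds the negated byte count for the inner loops, so (%esi,%edx) and
// (%edi,%edx) address the data while edx counts up toward zero; the single
// "addl $64,%edx" both advances the offset and provides the loop-termination test.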
// Forward loop for aligned operands.

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
	movdqa	(%esi,%edx),%xmm0
	movdqa	16(%esi,%edx),%xmm1
	movdqa	32(%esi,%edx),%xmm2
	movdqa	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx		// advance offset (counts up toward 0)
	jne	LAlignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for unaligned operands.
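// Only the loads are unaligned here; the stores still use MOVDQA because the
// destination was 16-byte aligned above.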
	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
	movdqu	(%esi,%edx),%xmm0
	movdqu	16(%esi,%edx),%xmm1
	movdqu	32(%esi,%edx),%xmm2
	movdqu	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx		// advance offset (counts up toward 0)
	jne	LUnalignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Reverse moves.  They are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
	addl	%ecx,%esi		// point to end of strings
	addl	%ecx,%edi
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes
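// Note: destructive overlap means the destination begins inside the source, so a
// forward copy would overwrite source bytes before they are read.  Copying from
// the high addresses downward avoids that.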
// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// #words
	jz	3f			// no whole words to move
1:					// loop copying words backwards
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:					// handle leftover bytes (0..3)
	andl	$3,%edx			// bytes?
	jz	5f			// no, done
4:					// loop copying bytes backwards
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
// Handle a reverse move long enough to justify using SSE.
//      ecx = length (> kShort)
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
	movl	%edi,%edx		// copy destination
	andl	$15,%edx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b
// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for LReverseShort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	subl	%edx,%esi		// point to endpoint of copy
	subl	%edx,%edi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no, use unaligned loads
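// When we enter the reverse copy loops:
//      ecx = residual length (0..63), finished by LReverseShort
//      edx = bytes to move in the inner loop, a multiple of 64, counted down to 0
//      esi = ptr to lowest source byte the inner loop will move
//      edi = ptr to lowest dest byte the inner loop will move (16-byte aligned)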
LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%esi,%edx),%xmm0
	movdqa	-32(%esi,%edx),%xmm1
	movdqa	-48(%esi,%edx),%xmm2
	movdqa	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx		// retreat toward the start of the buffers
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done
// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines, so plain MOVDQU
// loads are used.
LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%esi,%edx),%xmm0
	movdqu	-32(%esi,%edx),%xmm1
	movdqu	-48(%esi,%edx),%xmm2
	movdqu	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx		// retreat toward the start of the buffers
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done
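// The descriptors below register these SSE4.2 variants with the platfunc
// dispatcher so they can be selected at runtime when the CPU reports the
// kHasSSE4_2 capability bit (see cpu_capabilities.h).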
PLATFUNC_DESCRIPTOR(bcopy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memcpy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memmove,sse42,kHasSSE4_2,0)