/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"

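// Strategy overview, in C-like terms (comments only, summarizing the paths below):
//
//      if (overlap, i.e. (unsigned)(dst - src) < len)   copy in reverse (LReverse)
//      else if (len <= kShort)                          scalar doubleword/byte loops (Lshort)
//      else if (len >= kVeryLong)                       call _COMM_PAGE_LONGCOPY (non-temporal stores)
//      else if (8-byte aligned operands && len >= kFastUCode)   "rep/movsl" microcode fastpath
//      else                                             64-byte SSE loops, selected by (source & 15)
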
// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse3x, 32, 5)
LZero:
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

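// For example (derived from the loop below): len = 30 copies 30>>2 = 7 doublewords,
// then 30 & 3 = 2 leftover bytes, and falls through to Lexit.
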
Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

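// Worked example (numbers for illustration only): a 200-byte copy whose destination
// needed 5 alignment bytes arrives here with ecx = 195.  Below, ecx becomes 195 & 63 = 3,
// edx becomes 195 & -64 = 192, esi/edi are advanced by 192, and edx is negated to -192
// so the inner loops can count up toward zero.
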
LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        .set    LTableOffset, LTable - LZero
        leal    (LTableOffset)(,%eax,4), %eax   // load jump table entry address, relative to LZero
        movl    _COMM_PAGE_BCOPY(%eax), %eax    // load jump table entry
        addl    $(_COMM_PAGE_BCOPY), %eax       // add runtime address of LZero to get final function
        jmp     *%eax

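// In C-like terms (comment only), the dispatch above is roughly:
//      goto *(void *)(_COMM_PAGE_BCOPY + LTable[source & 15]);
// where LTable stores each LModN entry point as an offset from LZero, and
// _COMM_PAGE_BCOPY is the runtime address of LZero in the commpage.
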
        .align  2
LTable:                                 // table of copy loop addresses
// force generation of assembly-time constants.  Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
        .set    LMod0Offset, LMod0 - LZero
        .set    LMod1Offset, LMod1 - LZero
        .set    LMod2Offset, LMod2 - LZero
        .set    LMod3Offset, LMod3 - LZero
        .set    LMod4Offset, LMod4 - LZero
        .set    LMod5Offset, LMod5 - LZero
        .set    LMod6Offset, LMod6 - LZero
        .set    LMod7Offset, LMod7 - LZero
        .set    LMod8Offset, LMod8 - LZero
        .set    LMod9Offset, LMod9 - LZero
        .set    LMod10Offset, LMod10 - LZero
        .set    LMod11Offset, LMod11 - LZero
        .set    LMod12Offset, LMod12 - LZero
        .set    LMod13Offset, LMod13 - LZero
        .set    LMod14Offset, LMod14 - LZero
        .set    LMod15Offset, LMod15 - LZero
        .long   LMod0Offset
        .long   LMod1Offset
        .long   LMod2Offset
        .long   LMod3Offset
        .long   LMod4Offset
        .long   LMod5Offset
        .long   LMod6Offset
        .long   LMod7Offset
        .long   LMod8Offset
        .long   LMod9Offset
        .long   LMod10Offset
        .long   LMod11Offset
        .long   LMod12Offset
        .long   LMod13Offset
        .long   LMod14Offset
        .long   LMod15Offset


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.

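// The pushes below pass (dest, source, length) as a normal C-style argument list, so the
// call is effectively longcopy(dst, src, len)  (name illustrative; the routine lives at
// _COMM_PAGE_LONGCOPY).
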
LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

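// Note on the register juggling below: on entry edx = -(bytes assigned to the 64-byte
// loops) and ecx = residual length (0..63).  Because -edx is a multiple of 64, its low
// six bits are zero, so after the negate "orl %edx,%ecx" reassembles the total length
// (e.g. 0x1c0 | 0x27 = 0x1e7).
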
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

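// How the palignr loops below work (comment only): all loads are 16-byte aligned, one
// byte "early" relative to the data actually wanted.  "palignr $N,src,dst" concatenates
// dst:src (dst in the high half), shifts the 32-byte value right by N bytes, and keeps
// the low 16 bytes in dst.  So in LMod1, palignr $1 applied to the aligned blocks at
// offsets -1 and 15 (relative to the current copy position) yields source bytes 0..15,
// which are then stored with an aligned movdqa.  xmm0 carries the last block of each
// iteration into the next, so every 16 source bytes is loaded only once.
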
LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

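// How this variant works (comment only): the source is 4 bytes past 16-byte alignment,
// so each aligned block ends with the first 12 bytes of the next store.  "movss" merges
// the low 4 bytes of the following aligned block into the current one, and
// "pshufd $0x39" then rotates the four dwords so the 16 bytes come out in source order.
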
LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

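// Here (comment only) "shufpd $01,src,dst" builds dst from { low qword = old dst high
// qword, high qword = src low qword }, i.e. the 16 wanted bytes straddling two aligned
// blocks.  This case is also 8-byte mutually aligned with the destination, so like
// LMod0 it diverts to the "rep/movsl" fastpath once the length reaches kFastUCode.
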
LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

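// Mirror of LMod4 (comment only): here the aligned loads are done with "pshufd" straight
// from memory, which rotates each block by one dword as it is read, and "movss" then
// patches in the 4 bytes carried over from the previous (lower-addressed) block.
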
LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

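// The forward entry points branch here when (unsigned)(dest - source) < length, i.e.
// when the destination starts inside the source and an ascending copy would overwrite
// source bytes before they are read.  Copying from the top down avoids that.
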
LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)