/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (500*1024)      // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode      ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
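
// Taken together, these cutoffs partition copies roughly as in the C sketch
// below (illustrative only: the function and strategy names are ad hoc, and
// the alignment condition for the rep/movsl fastpath is actually tested
// inside the 16-byte- and 8-byte-aligned loops, not up front):
//
//      typedef enum { kWordLoop, kSSELoops, kRepMovsl, kLongCopy } strategy_t;
//
//      static strategy_t choose_strategy(size_t len, int is_8byte_aligned)
//      {
//          if (len <= kShort)                          // short: simple 4-byte/1-byte loops
//              return kWordLoop;
//          if (len >= kVeryLong)                       // huge: non-temporal longcopy routine
//              return kLongCopy;
//          if (len >= kFastUCode && is_8byte_aligned)  // big and mutually aligned: rep/movsl
//              return kRepMovsl;
//          return kSSELoops;                           // otherwise: 64-byte SSE loops below
//      }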


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align  5, 0x90
LZero:
Lbcopy_sse4:                            // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort
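
// The unsigned compare just above folds the overlap test into a single branch.
// A loose C equivalent (illustrative only; the cast and the helper names are
// not part of this file):
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//          copy_backward(dst, src, len);   // dst lands inside [src, src+len): go reverse
//      else
//          copy_forward(dst, src, len);    // disjoint, or dst below src: forward is safe
//
// When dst is below src the subtraction wraps to a huge unsigned value, so the
// test fails and the forward path is taken, which is correct there even if the
// buffers overlap.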

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

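// A rough C equivalent of the Lshort/LLeftovers path below, treating src and
// dst as byte pointers (illustrative only; unaligned 32-bit accesses are fine
// on this architecture):
//
//      size_t words = len >> 2;
//      while (words--) {                           // copy 4 bytes at a time
//          *(uint32_t *)dst = *(const uint32_t *)src;
//          src += 4;  dst += 4;
//      }
//      for (len &= 3; len; len--)                  // then the 0..3 leftover bytes
//          *dst++ = *src++;
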
Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

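// The prologue below brings the destination up to a 16-byte boundary with a
// byte loop. In C (illustrative only):
//
//      size_t pad = (size_t)(-(uintptr_t)dst) & 15;    // 0..15 bytes to the next 16-byte boundary
//      len -= pad;
//      while (pad--)
//          *dst++ = *src++;
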
LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

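// In C, the setup below amounts to (illustrative only; the variable names are
// ad hoc):
//
//      unsigned phase   = (uintptr_t)src & 15;     // eax: source misalignment, 0..15
//      size_t residual  = len & 63;                // ecx: 0..63 bytes left for Lshort afterwards
//      size_t chunked   = len & ~(size_t)63;       // whole 64-byte chunks (at least one)
//      src += chunked;  dst += chunked;            // esi/edi: first byte NOT moved
//      intptr_t off     = -(intptr_t)chunked;      // edx: negative index back from there
//      goto *LTable[phase];                        // pick the loop matching the source phase
//
// Each loop then addresses its operands as src[off]/dst[off] and steps off up
// toward zero in increments of 64.
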
LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0 + _COMM_PAGE_BCOPY - LZero
        .long   LMod1 + _COMM_PAGE_BCOPY - LZero
        .long   LMod2 + _COMM_PAGE_BCOPY - LZero
        .long   LMod3 + _COMM_PAGE_BCOPY - LZero
        .long   LMod4 + _COMM_PAGE_BCOPY - LZero
        .long   LMod5 + _COMM_PAGE_BCOPY - LZero
        .long   LMod6 + _COMM_PAGE_BCOPY - LZero
        .long   LMod7 + _COMM_PAGE_BCOPY - LZero
        .long   LMod8 + _COMM_PAGE_BCOPY - LZero
        .long   LMod9 + _COMM_PAGE_BCOPY - LZero
        .long   LMod10 + _COMM_PAGE_BCOPY - LZero
        .long   LMod11 + _COMM_PAGE_BCOPY - LZero
        .long   LMod12 + _COMM_PAGE_BCOPY - LZero
        .long   LMod13 + _COMM_PAGE_BCOPY - LZero
        .long   LMod14 + _COMM_PAGE_BCOPY - LZero
        .long   LMod15 + _COMM_PAGE_BCOPY - LZero


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.

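// The pushes below form an ordinary cdecl call frame, so this is roughly
// (illustrative only; the prototype is inferred from the argument order, not
// stated in this file):
//
//      void (*longcopy)(void *dst, const void *src, size_t len) =
//              (void *)_COMM_PAGE_LONGCOPY;
//      longcopy(dst, src, len);            // edi, esi, ecx pushed last-arg-first
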
LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

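// Lfastpath undoes the pointer adjustment made at LDestAligned and hands the
// whole remaining copy to "rep/movsl". A C sketch (illustrative only; "off"
// and "residual" are the quantities described at LDestAligned, not names in
// the code):
//
//      src += off;  dst += off;                    // off is negative: back to the 1st uncopied byte
//      size_t total = (size_t)(-off) | residual;   // off is a multiple of 64 and residual < 64,
//                                                  // so the OR is the same as an ADD here
//      // rep/movsl then copies total/4 doublewords, and LLeftovers picks up the last 0..3 bytes
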
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

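// LMod1 through LMod15 (except 4, 8, and 12) all use the same scheme, shown
// here for a misalignment of k == 1: do four aligned 16-byte loads that
// straddle the bytes we actually want, then use PALIGNR to pull each output
// vector out of two neighboring loads so that the stores stay aligned. For
// "palignr $k, lo, hi", where lo is the lower-addressed aligned block and hi
// the next one, the result is (illustrative byte-level sketch only):
//
//      result.byte[i] = (i < 16 - k) ? lo.byte[i + k] : hi.byte[i - (16 - k)];
//
// %xmm0 carries the last aligned load of one chunk into the next iteration,
// so each 64-byte chunk needs only four new loads.
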
LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

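// With a source that is 4 bytes off alignment, PALIGNR is not needed: each
// output vector is an aligned load with the missing low 4 bytes spliced in
// from the next aligned load, then rotated. Per vector, in C-style notation
// (illustrative only):
//
//      t = prev;                       // aligned load that starts 4 bytes early
//      t.u32[0] = next.u32[0];         // movss: splice in the following 4 source bytes
//      out = { t.u32[1], t.u32[2], t.u32[3], t.u32[0] };   // pshufd $0x39: rotate right 4 bytes
//      prev = next;                    // the unmodified later load primes the next vector
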
LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

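// With a source that is 8 bytes off alignment, each output vector is simply
// the high quadword of one aligned load followed by the low quadword of the
// next, which is exactly what "shufpd $01" builds. Per vector (illustrative
// only):
//
//      out.u64[0] = prev.u64[1];       // shufpd $01,next,prev: keep prev's high half...
//      out.u64[1] = next.u64[0];       // ...and append next's low half
//      prev = next;
//
// Since both operands are mutually 8-byte aligned here, sufficiently long
// copies are first diverted to the rep/movsl fastpath (see kFastUCode above).
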
LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

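// With a source that is 12 bytes off alignment, the loads themselves do the
// realignment: "pshufd $0x93" reads an aligned block with its doublewords
// rotated right by 12 bytes, and movss then patches in the one doubleword
// that belongs to the preceding block. Per vector (illustrative only):
//
//      r = { a.u32[3], a.u32[0], a.u32[1], a.u32[2] };     // pshufd $0x93 of aligned load "a"
//      r.u32[0] = carry;               // movss: the last dword of the previous aligned block
//      out = r;                        // 16 contiguous source bytes, aligned for the store
//      carry = a.u32[3];               // feeds the next vector (held in %xmm5/%xmm0 across chunks)
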
LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

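// In C, the reverse path amounts to (illustrative only): move both pointers
// one past the end and copy from the top down, so an overlapping destination
// never overwrites source bytes before they have been read.
//
//      src += len;  dst += len;
//      while (len--)
//          *--dst = *--src;            // done below in 64-, 4-, and 1-byte steps
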
LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for LReverseShort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

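// Note that, unlike the forward loops, there is no per-alignment dispatch
// here: the loads are simply unaligned (movdqu) while the stores stay aligned
// (movdqa), which is good enough for this uncommon destructive-overlap case.
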
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)