/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort		80			// too short to bother with SSE (must be >=80)
#define kVeryLong	(500*1024)		// large enough for non-temporal stores (must be >= 8192)
#define kFastUCode	((16*1024)-15)		// cutoff for microcode fastpath for "rep/movsl"
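
// For orientation, a rough C sketch of the size dispatch these constants drive
// (illustrative only, not assembled; the helper names are invented):
//
//	void *sketch_memcpy(void *dst, const void *src, size_t len) {
//	    if ((size_t)((char *)dst - (const char *)src) < len)
//	        return copy_backwards(dst, src, len);        // destructive overlap -> LReverse
//	    if (len <= kShort)
//	        return copy_words_then_bytes(dst, src, len); // simple doubleword loop
//	    if (len >= kVeryLong)
//	        return longcopy(dst, src, len);              // non-temporal path, off the commpage
//	    return copy_64byte_chunks(dst, src, len);        // SSE loops LMod0..LMod15 below
//	}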
// void bcopy(const void *src, void *dst, size_t len);

Lbcopy_sse4:				// void bcopy(const void *src, void *dst, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	8(%ebp),%esi		// get source ptr
	movl	12(%ebp),%edi		// get dest ptr
	movl	16(%ebp),%ecx		// get length
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	8(%ebp),%edi		// get dest ptr
	movl	12(%ebp),%esi		// get source ptr
	movl	16(%ebp),%ecx		// get length
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?

// Handle short forward copies. As the most common case, this is the fall-through path.
//	ecx = length (<= kShort)
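
// Roughly, in C (sketch only; x86 tolerates the unaligned 32-bit accesses):
//
//	uint32_t       *d = (uint32_t *)dst;
//	const uint32_t *s = (const uint32_t *)src;
//	for (size_t i = 0; i < len/4; i++)			// the doubleword loop at "2:"
//	    d[i] = s[i];
//	for (size_t i = len & ~(size_t)3; i < len; i++)		// 0..3 trailing bytes, LLeftovers
//	    ((char *)dst)[i] = ((const char *)src)[i];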
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
2:					// loop copying doublewords

LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
4:					// loop copying bytes

	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove

LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//	ecx = length (> kShort)

	cmpl	$(kVeryLong),%ecx	// long enough to justify heavyweight loops?
	movl	%edi,%edx		// copy destination
	jae	LVeryLong		// use very-long-operand path
	andl	$15,%edx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%edx,%ecx		// decrement length
1:					// loop copying 1..15 bytes

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are valid:
//	ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//	esi = ptr to 1st source byte not to move (unaligned)
//	edi = ptr to 1st dest byte not to move (aligned)
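
// In C terms, the dispatch that follows does roughly this (sketch only; names invented):
//
//	size_t chunk_bytes = len & ~(size_t)63;			// moved by the 64-byte inner loop
//	size_t residual    = len & 63;				// finished later by Lshort
//	copy_64byte_chunks[(uintptr_t)src & 15](dst, src, chunk_bytes);	// one of LMod0..LMod15 via LTable
//
// Each loop then runs %edx from -chunk_bytes up to zero, indexing off pointers that
// already sit just past the chunked region, so the loop condition is simply
// "has %edx reached zero".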
	movl	%ecx,%edx		// copy length
	movl	%esi,%eax		// copy source address
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	andl	$15,%eax		// mask to low 4 bits of source address
	addl	%edx,%esi		// point to 1st byte not copied
	negl	%edx			// now generate offset to 1st byte to be copied
	movl	(_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
LTable:					// table of copy loop addresses
	.long	LMod0  + _COMM_PAGE_BCOPY - LZero
	.long	LMod1  + _COMM_PAGE_BCOPY - LZero
	.long	LMod2  + _COMM_PAGE_BCOPY - LZero
	.long	LMod3  + _COMM_PAGE_BCOPY - LZero
	.long	LMod4  + _COMM_PAGE_BCOPY - LZero
	.long	LMod5  + _COMM_PAGE_BCOPY - LZero
	.long	LMod6  + _COMM_PAGE_BCOPY - LZero
	.long	LMod7  + _COMM_PAGE_BCOPY - LZero
	.long	LMod8  + _COMM_PAGE_BCOPY - LZero
	.long	LMod9  + _COMM_PAGE_BCOPY - LZero
	.long	LMod10 + _COMM_PAGE_BCOPY - LZero
	.long	LMod11 + _COMM_PAGE_BCOPY - LZero
	.long	LMod12 + _COMM_PAGE_BCOPY - LZero
	.long	LMod13 + _COMM_PAGE_BCOPY - LZero
	.long	LMod14 + _COMM_PAGE_BCOPY - LZero
	.long	LMod15 + _COMM_PAGE_BCOPY - LZero
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.
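
// In effect this path just makes an ordinary cdecl call, roughly
//
//	longcopy(dst, src, len);	// args pushed right to left, caller pops 12 bytes
//
// with the callee reached through its fixed commpage address (_COMM_PAGE_LONGCOPY).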
	pushl	%ecx			// length (>= kVeryLong)
	pushl	%esi			// source ptr
	pushl	%edi			// dest ptr
	movl	$(_COMM_PAGE_LONGCOPY),%eax
	call	*%eax			// do the long copy
	addl	$12,%esp		// pop off our parameters
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
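
// What the fastpath amounts to, as a C sketch (not assembled; "residual" and
// "chunk_bytes" are the two values recombined by the orl below):
//
//	size_t n = residual | chunk_bytes;		// total bytes still to move
//	for (size_t i = 0; i < n/4; i++)		// what "rep/movsl" does in microcode
//	    ((uint32_t *)dst)[i] = ((const uint32_t *)src)[i];
//	/* LLeftovers then copies the final n & 3 bytes */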
	addl	%edx,%esi		// restore ptrs to 1st byte of source and dest
	negl	%edx			// make length positive
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	movl	%ecx,%edx		// copy total length to move
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	jmp	LLeftovers		// handle 0..3 leftover bytes
// Forward loop for medium length operands in which low four bits of %esi == 0000

	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
	movdqa	(%esi,%edx),%xmm0
	movdqa	16(%esi,%edx),%xmm1
	movdqa	32(%esi,%edx),%xmm2
	movdqa	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0001

	movdqa	-1(%esi,%edx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
	movdqa	15(%esi,%edx),%xmm1
	movdqa	31(%esi,%edx),%xmm2
	movdqa	47(%esi,%edx),%xmm3
	movdqa	63(%esi,%edx),%xmm4

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
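
// For reference, one 64-byte chunk of the shift-and-merge scheme above, written with
// SSSE3 intrinsics (illustrative sketch, not part of this file; src is a char pointer
// one byte past a 16-byte boundary, dst is 16-byte aligned, prev was primed from src-1):
//
//	#include <tmmintrin.h>
//	__m128i v0 = _mm_load_si128((const __m128i *)(src + 15));
//	__m128i v1 = _mm_load_si128((const __m128i *)(src + 31));
//	__m128i v2 = _mm_load_si128((const __m128i *)(src + 47));
//	__m128i v3 = _mm_load_si128((const __m128i *)(src + 63));
//	_mm_store_si128((__m128i *)(dst +  0), _mm_alignr_epi8(v0, prev, 1));	// src[0..15]
//	_mm_store_si128((__m128i *)(dst + 16), _mm_alignr_epi8(v1, v0, 1));
//	_mm_store_si128((__m128i *)(dst + 32), _mm_alignr_epi8(v2, v1, 1));
//	_mm_store_si128((__m128i *)(dst + 48), _mm_alignr_epi8(v3, v2, 1));
//	prev = v3;				// carried into the next chunk
//
// The assembly performs the merges from the last vector back to the first so each
// palignr still sees its neighbor's unshifted value.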
// Forward loop for medium length operands in which low four bits of %esi == 0010

	movdqa	-2(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	14(%esi,%edx),%xmm1
	movdqa	30(%esi,%edx),%xmm2
	movdqa	46(%esi,%edx),%xmm3
	movdqa	62(%esi,%edx),%xmm4

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0011

	movdqa	-3(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	13(%esi,%edx),%xmm1
	movdqa	29(%esi,%edx),%xmm2
	movdqa	45(%esi,%edx),%xmm3
	movdqa	61(%esi,%edx),%xmm4

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

	movaps	-4(%esi,%edx),%xmm0	// 4-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movaps	12(%esi,%edx),%xmm1
	movaps	28(%esi,%edx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
	movaps	44(%esi,%edx),%xmm3
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%esi,%edx),%xmm4
	pshufd	$(0x39),%xmm2,%xmm2

	movaps	%xmm0,(%edi,%edx)
	pshufd	$(0x39),%xmm3,%xmm3
	movaps	%xmm1,16(%edi,%edx)
	movaps	%xmm2,32(%edi,%edx)
	movaps	%xmm3,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
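
// The 4-byte-offset case merges with movss and then rotates with pshufd. Per 16 bytes
// it is roughly the following (intrinsics sketch, not part of this file; src/dst are
// float pointers, src sitting 4 bytes past a 16-byte boundary, prev primed from the
// 16 aligned bytes starting 4 bytes before src):
//
//	#include <xmmintrin.h>
//	__m128 next   = _mm_load_ps(src + 3);			// source bytes 12..27
//	__m128 merged = _mm_move_ss(prev, next);		// low 4 bytes <- bytes 12..15
//	merged = _mm_shuffle_ps(merged, merged, 0x39);		// rotate: now bytes 0..15, in order
//	_mm_store_ps(dst, merged);
//	prev = next;						// and so on, 16 bytes at a time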
// Forward loop for medium length operands in which low four bits of %esi == 0101

	movdqa	-5(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	11(%esi,%edx),%xmm1
	movdqa	27(%esi,%edx),%xmm2
	movdqa	43(%esi,%edx),%xmm3
	movdqa	59(%esi,%edx),%xmm4

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0110

	movdqa	-6(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	10(%esi,%edx),%xmm1
	movdqa	26(%esi,%edx),%xmm2
	movdqa	42(%esi,%edx),%xmm3
	movdqa	58(%esi,%edx),%xmm4

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0111

	movdqa	-7(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	9(%esi,%edx),%xmm1
	movdqa	25(%esi,%edx),%xmm2
	movdqa	41(%esi,%edx),%xmm3
	movdqa	57(%esi,%edx),%xmm4

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%esi,%edx),%xmm0	// 8-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movapd	8(%esi,%edx),%xmm1
	movapd	24(%esi,%edx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
	movapd	40(%esi,%edx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%esi,%edx),%xmm4
	shufpd	$01,%xmm3,%xmm2

	movapd	%xmm0,(%edi,%edx)
	shufpd	$01,%xmm4,%xmm3
	movapd	%xmm1,16(%edi,%edx)
	movapd	%xmm2,32(%edi,%edx)
	movapd	%xmm3,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
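
// With 8-byte alignment a single shufpd splices each output vector. Per 16 bytes,
// roughly (intrinsics sketch, not part of this file; src/dst are double pointers,
// src sitting 8 bytes past a 16-byte boundary, prev primed from src-1):
//
//	#include <emmintrin.h>
//	__m128d next = _mm_load_pd(src + 1);			// source bytes 8..23
//	_mm_store_pd(dst, _mm_shuffle_pd(prev, next, 1));	// {bytes 0..7, bytes 8..15}
//	prev = next;						// continue through the chunk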
// Forward loop for medium length operands in which low four bits of %esi == 1001

	movdqa	-9(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	7(%esi,%edx),%xmm1
	movdqa	23(%esi,%edx),%xmm2
	movdqa	39(%esi,%edx),%xmm3
	movdqa	55(%esi,%edx),%xmm4

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1010

	movdqa	-10(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	6(%esi,%edx),%xmm1
	movdqa	22(%esi,%edx),%xmm2
	movdqa	38(%esi,%edx),%xmm3
	movdqa	54(%esi,%edx),%xmm4

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1011

	movdqa	-11(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	5(%esi,%edx),%xmm1
	movdqa	21(%esi,%edx),%xmm2
	movdqa	37(%esi,%edx),%xmm3
	movdqa	53(%esi,%edx),%xmm4

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

	movss	(%esi,%edx),%xmm0	// prefetch 1st four bytes of source, right justified
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%esi,%edx),%xmm2
	pshufd	$(0x93),36(%esi,%edx),%xmm3
	pshufd	$(0x93),52(%esi,%edx),%xmm4

	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination

	movaps	%xmm1,(%edi,%edx)
	movaps	%xmm2,16(%edi,%edx)
	movaps	%xmm3,32(%edi,%edx)
	movaps	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
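
// This is the mirror image of the 4-byte case: each 16-byte load is pre-rotated with
// pshufd (0x93) so its three useful doublewords land in elements 1..3, and movss then
// fills in the missing low doubleword carried over from the previous load (the first
// carry being the single doubleword prefetched above).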
// Forward loop for medium length operands in which low four bits of %esi == 1101

	movdqa	-13(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	3(%esi,%edx),%xmm1
	movdqa	19(%esi,%edx),%xmm2
	movdqa	35(%esi,%edx),%xmm3
	movdqa	51(%esi,%edx),%xmm4

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1110

	movdqa	-14(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	2(%esi,%edx),%xmm1
	movdqa	18(%esi,%edx),%xmm2
	movdqa	34(%esi,%edx),%xmm3
	movdqa	50(%esi,%edx),%xmm4

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1111

	movdqa	-15(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	1(%esi,%edx),%xmm1
	movdqa	17(%esi,%edx),%xmm2
	movdqa	33(%esi,%edx),%xmm3
	movdqa	49(%esi,%edx),%xmm4

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
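
// The shape of a reverse (overlap-safe) copy, as a C sketch (illustrative only):
//
//	const char *s = (const char *)src + len;	// one past the end, as below
//	char       *d = (char *)dst + len;
//	while (len--)
//	    *--d = *--s;
//
// The code below does the same thing a doubleword at a time for short operands, and
// in 16-byte-aligned 64-byte chunks (falling back to the short path for the residue)
// for longer ones.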
	addl	%ecx,%esi		// point to end of strings
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	esi = one byte past end of source
//	edi = one byte past end of dest

	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// #words

	andl	$3,%edx			// bytes?

	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//	esi = one byte past end of source
//	edi = one byte past end of dest

	movl	%edi,%edx		// copy destination
	andl	$15,%edx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes

// Destination is now aligned. Prepare for reverse loops.

	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	subl	%edx,%esi		// point to endpoint of copy
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%esi,%edx),%xmm0
	movdqa	-32(%esi,%edx),%xmm1
	movdqa	-48(%esi,%edx),%xmm2
	movdqa	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%esi,%edx),%xmm0
	movdqu	-32(%esi,%edx),%xmm1
	movdqu	-48(%esi,%edx),%xmm2
	movdqu	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

	COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)