/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * SSE4 and 64-byte cache lines. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"
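
// Illustrative C-style sketch (added comment, not from the original source; the
// helper names are hypothetical). It shows how the thresholds above partition the
// forward-copy strategy:
//
//      if (len <= kShort)          copy_short(dst, src, len);          // dword/byte loops (LShort)
//      else if (len >= kVeryLong)  commpage_longcopy(dst, src, len);   // out-of-line, non-temporal
//      else                        copy_sse_64byte_chunks(dst, src, len);
//                                  // ...except that the cases where the source is 8- or 16-byte
//                                  // aligned relative to the 16-byte-aligned destination
//                                  // (LMod0/LMod8) switch to "rep/movsl" once the remaining
//                                  // length reaches kFastUCode.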


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
LZero:
Lbcopy_sse4_64:                         // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

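// Illustrative C-style sketch (added comment, not from the original source): the
// single unsigned compare above is the entire overlap test.
//
//      uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;   // wraps when dst < src
//      if (diff < len)         // dst lands inside [src, src+len): forward copy would clobber
//              copy_reverse(dst, src, len);                 // hypothetical helper
//      else                    // disjoint, or dst below src: forward copy is safe
//              copy_forward(dst, src, len);                 // hypothetical helper
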
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

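// Illustrative C-style sketch (added comment, not from the original source) of the
// LShort path below:
//
//      unsigned char *d = dst;  const unsigned char *s = src;
//      for (uint32_t dwords = len >> 2; dwords; dwords--, s += 4, d += 4)
//              *(uint32_t *)d = *(const uint32_t *)s;      // doubleword loop
//      for (len &= 3; len; len--)                           // 0..3 leftover bytes
//              *d++ = *s++;
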
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

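// Illustrative C-style sketch (added comment, not from the original source) of the
// dispatch performed by LDestAligned below; LTable and LMod0..LMod15 are the labels
// in this file, everything else is hypothetical:
//
//      size_t chunks   = len & ~63;            // bytes moved by the inner loop
//      size_t residual = len & 63;             // finished later by LShort
//      int32_t offset  = LTable[src & 15];     // offset of LMod0..LMod15 from LTable
//      /* jump to (char *)LTable + offset, with rsi/rdi advanced past the chunks
//         and rcx holding -chunks so the loops count up toward zero */
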
LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8                             // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

        .align  2
LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)

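// Illustrative C-style sketch (added comment, not from the original source; the helper
// name is hypothetical): the block below just calls the commpage long-copy routine with
// the original arguments and then returns dst.
//
//      commpage_longcopy(dst, src, len);
//      return dst;
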
LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

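// Illustrative C-style sketch (added comment, not from the original source) of what
// Lfastpath below amounts to once the pointers are restored:
//
//      size_t n = chunks + residual;           // total bytes still to move (< 2GB)
//      for (size_t dwords = n >> 2; dwords; dwords--)      // the "rep/movsl" part
//              { *(uint32_t *)d = *(const uint32_t *)s;  s += 4;  d += 4; }
//      n &= 3;                                 // 0..3 bytes left, finished by LLeftovers
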
Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001

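// Added comment (not from the original source): LMod1..LMod15 all use the same trick,
// noted here once. Two adjacent aligned loads are merged with PALIGNR so every load and
// store stays 16-byte aligned even though the source is offset by n = 1..15 bytes.
// In AT&T syntax, "palignr $n, %xmmSRC, %xmmDST" computes
//
//      DST = low 16 bytes of ( (DST:SRC) >> 8*n )
//
// so result byte i is SRC[i+n] when i+n < 16, else DST[i+n-16]; e.g. with n == 1 the
// result is { SRC[1..15], DST[0] }. With SRC holding the earlier aligned block and DST
// the later one, the result is exactly the 16 unaligned source bytes that belong in the
// current aligned destination slot.
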
LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

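// Added comment (not from the original source): for a 4-byte offset the merge is done
// with MOVSS plus a dword rotate instead of PALIGNR. Roughly, per 16-byte slot:
//
//      movss   %xmmNEXT,%xmmCUR        // CUR = { NEXT[dword 0], CUR[dwords 1..3] }
//      pshufd  $0x39,%xmmCUR,%xmmCUR   // rotate dwords right: { d1, d2, d3, d0 }
//
// which leaves CUR holding the 16 source bytes starting 4 bytes into the old CUR block,
// i.e. the correctly re-aligned data for the next aligned store. (xmmCUR/xmmNEXT are
// placeholder names for the earlier and later aligned blocks.)
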
LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

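// Added comment (not from the original source): the 8-byte-offset case can merge with
// one SHUFPD per slot instead of PALIGNR. With CUR holding the earlier aligned block and
// NEXT the following one (placeholder names), "shufpd $1,%xmmNEXT,%xmmCUR" produces
//
//      CUR = { low qword = CUR[qword 1], high qword = NEXT[qword 0] }
//
// i.e. the source data shifted by 8 bytes, ready for the next aligned 16-byte store.
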
LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

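// Added comment (not from the original source): LMod12 is the mirror image of LMod4.
// Each 16-byte group is loaded with "pshufd $0x93" (rotate right 12 bytes, so the dwords
// become { d3, d0, d1, d2 }), and MOVSS then replaces the low dword with the 4 straggler
// bytes carried over from the previous group, so only aligned 16-byte stores are issued.
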
LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

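// Illustrative C-style sketch (added comment, not from the original source) of the
// reverse path below:
//
//      const unsigned char *s = src + len;     // point one past the ends
//      unsigned char *d = dst + len;
//      if (len > kShort) {
//              /* byte-copy backwards until d is 16-byte aligned, then move 64-byte
//                 chunks backwards with movdqa/movdqu, leaving 0..63 bytes */
//      }
//      while (len--) *--d = *--s;              // LReverseShort actually does quadwords
//                                              // first, then the remaining bytes
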
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)