/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * SSE4 and 64-byte cache lines.  This is the 64-bit version.  (As the commpage
 * descriptor at the end of this file indicates, the "palignr" loops below also
 * require Supplemental SSE3.)
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (500*1024)      // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode      ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
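
// Roughly, these cutoffs partition forward copies as follows (see the code below):
//
//      if (len <= kShort)          use the doubleword/byte loops at LShort
//      else if (len >= kVeryLong)  call the separate longcopy routine (non-temporal stores)
//      else                        run one of the 64-byte SSE loops LMod0..LMod15; the
//                                  mutually 8- and 16-byte-aligned cases switch to the
//                                  "rep/movsl" microcode once the length reaches kFastUCode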


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
LZero:
Lbcopy_sse4_64:                         // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//
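// (The ".align 5, 0x90" directives pad the entries to 32-byte boundaries; callers
// presumably reach these routines at fixed commpage offsets, which is why the
// distance from the bcopy() entry matters.)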

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr
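//
// In C terms this path is roughly (an illustrative sketch, with src and dst taken
// as byte pointers):
//
//      while (len >= 4) { *(uint32_t *)dst = *(uint32_t *)src; src += 4; dst += 4; len -= 4; }
//      while (len > 0)  { *(uint8_t  *)dst = *(uint8_t  *)src; src += 1; dst += 1; len -= 1; }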

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)
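//
// In rough C-like terms, the dispatch below computes (a sketch using the register
// roles listed above):
//
//      chunks   = length & ~63;        // bytes handled by the 64-byte loops
//      residual = length &  63;        // left over for LShort afterward
//      src += chunks;  dst += chunks;  // point just past the chunked region
//      index    = -chunks;             // the loops count %rcx up from -chunks to 0
//      goto LMod[src & 15];            // i.e. LTable base + LTable[src & 15]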

LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8                             // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

        .align  2
LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)
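//
// The destination, source, and length are already in %rdi, %rsi, and %rdx, so the call
// below behaves like an ordinary indirect C call (a sketch; the exact longcopy signature
// is not spelled out here):
//
//      void (*longcopy)(void *dst, const void *src, size_t len) = <routine at _COMM_PAGE_LONGCOPY>;
//      longcopy(dst, src, len);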

LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
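//
// On entry to Lfastpath, %rcx holds the negated chunked byte count and %edx the 0..63
// residual, so the first four instructions reconstruct the original pointers and total
// length.  Roughly (a sketch):
//
//      src -= chunks;  dst -= chunks;  // undo the "point past the region" adjustment
//      len  = chunks | residual;       // disjoint bit ranges, so this equals chunks + residual
//      copy len/4 doublewords with "rep/movsl", then the 0..3 leftover bytes at LLeftovers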

Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001
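//
// LMod1..LMod15 (except the 4-, 8-, and 12-byte cases) all use the same technique:
// load aligned 16-byte blocks that straddle the bytes we want, then let "palignr"
// shift each adjacent pair right by the source misalignment to recover the bytes
// actually needed.  For a misalignment of 1 (a sketch, bytes labeled by source offset):
//
//      xmm5 = src[-1..14]  (aligned load)
//      xmm1 = src[15..30]  (aligned load)
//      palignr $1,%xmm5,%xmm1   =>   xmm1 = src[0..15], ready for an aligned store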

LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
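//
// The idea (a sketch): each aligned 16-byte load is off by one 32-bit word, so we patch
// the low word of a vector with the low word of the next load ("movss") and then rotate
// right by one word ("pshufd $0x39") to restore memory order.  With words labeled by
// source offset, element 0 listed first:
//
//      xmm0 = {w-1, w0, w1, w2},  xmm1 = {w3, w4, w5, w6}
//      movss  %xmm1,%xmm0        =>   xmm0 = {w3, w0, w1, w2}
//      pshufd $0x39,%xmm0,%xmm0  =>   xmm0 = {w0, w1, w2, w3}   (ready for an aligned store)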

LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
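//
// Here the source is 8-byte aligned relative to the 16-byte-aligned destination, so each
// output vector is just the high quadword of one aligned load glued to the low quadword
// of the next; "shufpd $01" does exactly that.  With quadwords labeled by source offset
// (a sketch, element 0 listed first):
//
//      xmm0 = {q-1, q0},  xmm1 = {q1, q2}
//      shufpd $01,%xmm1,%xmm0   =>   xmm0 = {q0, q1}   (ready for an aligned store)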

LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
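//
// This is the mirror image of LMod4: each aligned 16-byte block is rotated right by
// 12 bytes as it is loaded ("pshufd $0x93" straight from memory), and "movss" then
// drops in the one word belonging to the previous block.  With words labeled by source
// offset (a sketch, element 0 listed first):
//
//      pshufd $0x93 of {w1, w2, w3, w4}   =>   xmm1 = {w4, w1, w2, w3}
//      movss  %xmm0,%xmm1  (low word of xmm0 is w0)   =>   xmm1 = {w0, w1, w2, w3}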

LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr
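//
// In outline (a sketch): both pointers are first advanced to the end of their buffers,
// and everything below copies downward so overlapping bytes are read before they are
// overwritten.  For the short case this is roughly:
//
//      src += len;  dst += len;
//      while (len >= 8) { src -= 8; dst -= 8; len -= 8; *(uint64_t *)dst = *(uint64_t *)src; }
//      while (len > 0)  { src -= 1; dst -= 1; len -= 1; *(uint8_t  *)dst = *(uint8_t  *)src; }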

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)