/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include "platfunc.h"

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort 80 // too short to bother with SSE (must be >=80)
#define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl"
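
// A rough, non-normative sketch (added commentary only, not code) of how these
// cutoffs steer the dispatch below; the pseudocode is illustrative, not the ABI:
//
//     if (len <= kShort)          copy doublewords, then 0..3 leftover bytes (LShort)
//     else if (len >= kVeryLong)  call _longcopy, which can use non-temporal stores
//     else {
//         rep/movsb until the destination is 16-byte aligned;
//         if ((src & 15) is 0 or 8 and the 64-byte-chunked length >= kFastUCode)
//             rep/movsl                             // microcode fastpath (Lfastpath)
//         else
//             64-byte SSE loop chosen by (src & 15) // LMod0..LMod15
//     }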

// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START_GENERIC(bcopy, sse3x, 64, 5)
LZero:
        pushq %rbp // set up a frame for backtraces
        movq %rsp,%rbp
        movq %rsi,%rax // copy dest ptr
        movq %rdi,%rsi // exchange source and dest ptrs
        movq %rax,%rdi
        subq %rsi,%rax // (dest - source)
        cmpq %rdx,%rax // must move in reverse if (dest - source) < length
        jb LReverseIsland
        cmpq $(kShort),%rdx // long enough to bother with SSE?
        jbe LShort // no
        jmp LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START_GENERIC(memcpy, sse3x, 64, 0) // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START_GENERIC(memmove, sse3x, 64, 0) // void *memmove(void *dst, const void *src, size_t len)
        pushq %rbp // set up a frame for backtraces
        movq %rsp,%rbp
        movq %rdi,%r11 // save return value here
        movq %rdi,%rax
        subq %rsi,%rax // (dest - source)
        cmpq %rdx,%rax // must move in reverse if (dest - source) < length
        jb LReverseIsland
        cmpq $(kShort),%rdx // long enough to bother with SSE?
        ja LNotShort // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
// rdx = length (<= kShort)
// rsi = source ptr
// rdi = dest ptr

LShort:
        movl %edx,%ecx // copy length using 32-bit operation
        shrl $2,%ecx // get #doublewords
        jz LLeftovers
2: // loop copying doublewords
        movl (%rsi),%eax
        addq $4,%rsi
        movl %eax,(%rdi)
        addq $4,%rdi
        decl %ecx
        jnz 2b
LLeftovers: // handle leftover bytes (0..3) in last word
        andl $3,%edx // any leftover bytes?
        jz 5f
4: // loop copying bytes
        movb (%rsi),%al
        incq %rsi
        movb %al,(%rdi)
        incq %rdi
        decl %edx
        jnz 4b
5:
        movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
        popq %rbp
        ret


LReverseIsland: // keep the "jb" above a short branch...
        jmp LReverse // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
// rdx = length (> kShort)
// rsi = source ptr
// rdi = dest ptr

LNotShort:
        cmpq $(kVeryLong),%rdx // long enough to justify heavyweight loops?
        jae LVeryLong // use very-long-operand path
        movl %edi,%ecx // copy low half of destination ptr
        negl %ecx
        andl $15,%ecx // get #bytes to align destination
        jz LDestAligned // already aligned
        subl %ecx,%edx // decrement length
        rep // align destination
        movsb


// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
// rdx = residual length (0..63)
// rcx = -(length to move), a multiple of 64 less than 2GB
// rsi = ptr to 1st source byte not to move (unaligned)
// rdi = ptr to 1st dest byte not to move (aligned)
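//
// Worked example (illustrative values, not from the code): with length = 1000
// and a source whose low four bits are 5, LDestAligned leaves
//     rdx = 1000 & 63  = 40     (leftover bytes, finished by LShort)
//     rcx = 1000 & -64 = 960    (bytes moved by the 64-byte loop)
// advances rsi/rdi by 960, negates rcx, fetches LTable[5] and jumps to
// LTable + that offset, i.e. LMod5; inside the loop (%rsi,%rcx) and (%rdi,%rcx)
// then address the first byte still to be copied.
//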

LDestAligned:
        movq %rdx,%rcx // copy length
        movl %esi,%eax // copy low half of source address
        andl $63,%edx // get remaining bytes for LShort
        andl $15,%eax // mask to low 4 bits of source address
        andq $-64,%rcx // get number of bytes we will copy in inner loop
        leaq LTable(%rip), %r8
        addq %rcx,%rsi // point to 1st byte not copied
        addq %rcx,%rdi
        movl (%r8,%rax,4),%eax // get offset of routine
        negq %rcx // now generate offset to 1st byte to be copied
        addq %r8,%rax // generate address of copy loop
        jmp *%rax // enter copy loop, selected by source alignment

        .align 2
LTable: // table of copy loop addresses
// force generation of assembly-time constants. Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
        .set LMod0Offset, LMod0 - LTable
        .set LMod1Offset, LMod1 - LTable
        .set LMod2Offset, LMod2 - LTable
        .set LMod3Offset, LMod3 - LTable
        .set LMod4Offset, LMod4 - LTable
        .set LMod5Offset, LMod5 - LTable
        .set LMod6Offset, LMod6 - LTable
        .set LMod7Offset, LMod7 - LTable
        .set LMod8Offset, LMod8 - LTable
        .set LMod9Offset, LMod9 - LTable
        .set LMod10Offset, LMod10 - LTable
        .set LMod11Offset, LMod11 - LTable
        .set LMod12Offset, LMod12 - LTable
        .set LMod13Offset, LMod13 - LTable
        .set LMod14Offset, LMod14 - LTable
        .set LMod15Offset, LMod15 - LTable
        .long LMod0Offset
        .long LMod1Offset
        .long LMod2Offset
        .long LMod3Offset
        .long LMod4Offset
        .long LMod5Offset
        .long LMod6Offset
        .long LMod7Offset
        .long LMod8Offset
        .long LMod9Offset
        .long LMod10Offset
        .long LMod11Offset
        .long LMod12Offset
        .long LMod13Offset
        .long LMod14Offset
        .long LMod15Offset


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarking. There isn't enough room for them in the
// area reserved on the platfunc for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
// rdi = dest
// rsi = source
// rdx = length (>= kVeryLong bytes)
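//
// In C terms this is just an ordinary call, roughly longcopy(dst, src, len);
// the exact prototype of _longcopy is assumed here, and any value it returns
// is discarded in favor of the saved dst pointer.
//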

LVeryLong:
        pushq %r11 // save return value
        call _longcopy // call very long operand routine
        popq %rax // pop return value
        popq %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
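//
// What Lfastpath does, sketched in words (commentary only): on entry rcx holds
// -(chunked length) and rdx the 0..63 leftover bytes, with rsi/rdi already
// advanced past the region, so adding rcx steps both pointers back to the first
// byte. Because the negated chunk count has its low six bits clear, the
// "orl %edx,%ecx" below acts as an add and rebuilds the total byte count, which
// "rep/movsl" then moves four bytes at a time, leaving 0..3 bytes to LLeftovers.
//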

Lfastpath:
        addq %rcx,%rsi // restore ptrs to 1st byte of source and dest
        addq %rcx,%rdi
        negl %ecx // make length positive (known to be < 2GB)
        orl %edx,%ecx // restore total #bytes remaining to move
        cld // we'll move forward
        shrl $2,%ecx // compute #words to move
        rep // the u-code will optimize this
        movsl
        jmp LLeftovers // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
        jle Lfastpath // long enough for fastpath in microcode
        jmp 1f
        .align 4,0x90 // 16-byte align inner loops
1: // loop over 64-byte chunks
        movdqa (%rsi,%rcx),%xmm0
        movdqa 16(%rsi,%rcx),%xmm1
        movdqa 32(%rsi,%rcx),%xmm2
        movdqa 48(%rsi,%rcx),%xmm3

        movdqa %xmm0,(%rdi,%rcx)
        movdqa %xmm1,16(%rdi,%rcx)
        movdqa %xmm2,32(%rdi,%rcx)
        movdqa %xmm3,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001
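//
// How these misaligned loops work (an illustrative walk-through, relying on the
// SSSE3 definition of palignr): "palignr $n,%xmmA,%xmmB" concatenates B:A into
// a 32-byte value, shifts it right n bytes, and keeps the low 16 bytes in B.
// With n == 1, if %xmm5 holds source bytes S[-1..14] (the aligned load at -1)
// and %xmm1 holds S[15..30] (the aligned load at 15), then
// "palignr $1,%xmm5,%xmm1" leaves S[0..15] in %xmm1 -- exactly the 16 bytes to
// store at the aligned destination. %xmm0 carries the last aligned load into
// the next iteration, so each aligned 16-byte block of source is loaded once.
//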

LMod1:
        movdqa -1(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 15(%rsi,%rcx),%xmm1
        movdqa 31(%rsi,%rcx),%xmm2
        movdqa 47(%rsi,%rcx),%xmm3
        movdqa 63(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa -2(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 14(%rsi,%rcx),%xmm1
        movdqa 30(%rsi,%rcx),%xmm2
        movdqa 46(%rsi,%rcx),%xmm3
        movdqa 62(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa -3(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 13(%rsi,%rcx),%xmm1
        movdqa 29(%rsi,%rcx),%xmm2
        movdqa 45(%rsi,%rcx),%xmm3
        movdqa 61(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
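//
// Illustrative walk-through (commentary only): with the source 4-byte aligned,
// -4(%rsi,%rcx) is an aligned load of dwords D[-1],D[0],D[1],D[2] and
// 12(%rsi,%rcx) holds D[3]..D[6]. "movss %xmm1,%xmm0" replaces the low dword of
// %xmm0 with D[3], giving lanes D[3],D[0],D[1],D[2]; "pshufd $0x39" rotates the
// lanes right by one, yielding D[0],D[1],D[2],D[3] -- the 16 aligned bytes to
// store. The last load is kept in %xmm0 for the next iteration.
//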

LMod4:
        movaps -4(%rsi,%rcx),%xmm0 // 4-byte aligned: prime the loop
        jmp 1f
        .align 4,0x90
1: // loop over 64-byte chunks
        movaps 12(%rsi,%rcx),%xmm1
        movaps 28(%rsi,%rcx),%xmm2
        movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
        pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps 44(%rsi,%rcx),%xmm3
        movss %xmm2,%xmm1
        pshufd $(0x39),%xmm1,%xmm1
        movaps 60(%rsi,%rcx),%xmm4
        movss %xmm3,%xmm2
        pshufd $(0x39),%xmm2,%xmm2

        movaps %xmm0,(%rdi,%rcx)
        movss %xmm4,%xmm3
        pshufd $(0x39),%xmm3,%xmm3
        movaps %xmm1,16(%rdi,%rcx)
        movaps %xmm2,32(%rdi,%rcx)
        movaps %xmm4,%xmm0
        movaps %xmm3,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa -5(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 11(%rsi,%rcx),%xmm1
        movdqa 27(%rsi,%rcx),%xmm2
        movdqa 43(%rsi,%rcx),%xmm3
        movdqa 59(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa -6(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 10(%rsi,%rcx),%xmm1
        movdqa 26(%rsi,%rcx),%xmm2
        movdqa 42(%rsi,%rcx),%xmm3
        movdqa 58(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa -7(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 9(%rsi,%rcx),%xmm1
        movdqa 25(%rsi,%rcx),%xmm2
        movdqa 41(%rsi,%rcx),%xmm3
        movdqa 57(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
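//
// Illustrative walk-through (commentary only): -8(%rsi,%rcx) is an aligned load
// of qwords Q[-1],Q[0] and 8(%rsi,%rcx) holds Q[1],Q[2]. With immediate 01,
// "shufpd $01,%xmm1,%xmm0" keeps the high qword of %xmm0 and the low qword of
// %xmm1, leaving Q[0],Q[1] in %xmm0 -- the 16 aligned bytes to store. As in the
// other loops, the last load is saved in %xmm0 for the next iteration.
//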

LMod8:
        cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
        jle Lfastpath // long enough for fastpath in microcode
        movapd -8(%rsi,%rcx),%xmm0 // 8-byte aligned: prime the loop
        jmp 1f
        .align 4,0x90
1: // loop over 64-byte chunks
        movapd 8(%rsi,%rcx),%xmm1
        movapd 24(%rsi,%rcx),%xmm2
        shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd 40(%rsi,%rcx),%xmm3
        shufpd $01,%xmm2,%xmm1
        movapd 56(%rsi,%rcx),%xmm4
        shufpd $01,%xmm3,%xmm2

        movapd %xmm0,(%rdi,%rcx)
        shufpd $01,%xmm4,%xmm3
        movapd %xmm1,16(%rdi,%rcx)
        movapd %xmm2,32(%rdi,%rcx)
        movapd %xmm4,%xmm0
        movapd %xmm3,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa -9(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 7(%rsi,%rcx),%xmm1
        movdqa 23(%rsi,%rcx),%xmm2
        movdqa 39(%rsi,%rcx),%xmm3
        movdqa 55(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa -10(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 6(%rsi,%rcx),%xmm1
        movdqa 22(%rsi,%rcx),%xmm2
        movdqa 38(%rsi,%rcx),%xmm3
        movdqa 54(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa -11(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 5(%rsi,%rcx),%xmm1
        movdqa 21(%rsi,%rcx),%xmm2
        movdqa 37(%rsi,%rcx),%xmm3
        movdqa 53(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss (%rsi,%rcx),%xmm0 // prefetch 1st four bytes of source, right justified
        jmp 1f
        .align 4,0x90
1: // loop over 64-byte chunks
        pshufd $(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd $(0x93),20(%rsi,%rcx),%xmm2
        pshufd $(0x93),36(%rsi,%rcx),%xmm3
        pshufd $(0x93),52(%rsi,%rcx),%xmm4

        movaps %xmm4,%xmm5
        movss %xmm3,%xmm4 // copy low 4 bytes of source into destination
        movss %xmm2,%xmm3
        movss %xmm1,%xmm2
        movss %xmm0,%xmm1

        movaps %xmm1,(%rdi,%rcx)
        movaps %xmm2,16(%rdi,%rcx)
        movaps %xmm5,%xmm0
        movaps %xmm3,32(%rdi,%rcx)
        movaps %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa -13(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 3(%rsi,%rcx),%xmm1
        movdqa 19(%rsi,%rcx),%xmm2
        movdqa 35(%rsi,%rcx),%xmm3
        movdqa 51(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa -14(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 2(%rsi,%rcx),%xmm1
        movdqa 18(%rsi,%rcx),%xmm2
        movdqa 34(%rsi,%rcx),%xmm3
        movdqa 50(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa -15(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 1(%rsi,%rcx),%xmm1
        movdqa 17(%rsi,%rcx),%xmm2
        movdqa 33(%rsi,%rcx),%xmm3
        movdqa 49(%rsi,%rcx),%xmm4

        movdqa %xmm0,%xmm5
        movdqa %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa %xmm1,(%rdi,%rcx)
        movdqa %xmm2,16(%rdi,%rcx)
        movdqa %xmm3,32(%rdi,%rcx)
        movdqa %xmm4,48(%rdi,%rcx)

        addq $64,%rcx
        jnz 1b

        jmp LShort // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
// rdx = length
// rsi = source ptr
// rdi = dest ptr
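//
// For example (illustrative): memmove(p+1, p, n) overlaps so that a forward
// copy would overwrite source bytes before reading them; the "jb" checks at the
// top of bcopy/memcpy take this path whenever 0 <= (dest - source) < length.
//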

LReverse:
        addq %rdx,%rsi // point to end of strings
        addq %rdx,%rdi
        cmpq $(kShort),%rdx // long enough to bother with SSE?
        ja LReverseNotShort // yes

// Handle reverse short copies.
// edx = length (<= kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest

LReverseShort:
        movl %edx,%ecx // copy length
        shrl $3,%ecx // #quadwords
        jz 3f
1:
        subq $8,%rsi
        movq (%rsi),%rax
        subq $8,%rdi
        movq %rax,(%rdi)
        decl %ecx
        jnz 1b
3:
        andl $7,%edx // bytes?
        jz 5f
4:
        decq %rsi
        movb (%rsi),%al
        decq %rdi
        movb %al,(%rdi)
        decl %edx
        jnz 4b
5:
        movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
        popq %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
// rdx = length (> kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest

LReverseNotShort:
        movl %edi,%ecx // copy destination
        andl $15,%ecx // get #bytes to align destination
        je LReverseDestAligned // already aligned
        subq %rcx,%rdx // adjust length
1: // loop copying 1..15 bytes
        decq %rsi
        movb (%rsi),%al
        decq %rdi
        movb %al,(%rdi)
        decl %ecx
        jnz 1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movq %rdx,%rcx // copy length
        andl $63,%edx // get remaining bytes for LReverseShort
        andq $-64,%rcx // get number of bytes we will copy in inner loop
        subq %rcx,%rsi // point to endpoint of copy
        subq %rcx,%rdi
        testl $15,%esi // is source aligned too?
        jnz LReverseUnalignedLoop // no

LReverseAlignedLoop: // loop over 64-byte chunks
        movdqa -16(%rsi,%rcx),%xmm0
        movdqa -32(%rsi,%rcx),%xmm1
        movdqa -48(%rsi,%rcx),%xmm2
        movdqa -64(%rsi,%rcx),%xmm3

        movdqa %xmm0,-16(%rdi,%rcx)
        movdqa %xmm1,-32(%rdi,%rcx)
        movdqa %xmm2,-48(%rdi,%rcx)
        movdqa %xmm3,-64(%rdi,%rcx)

        subq $64,%rcx
        jne LReverseAlignedLoop

        jmp LReverseShort // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop: // loop over 64-byte chunks
        movdqu -16(%rsi,%rcx),%xmm0
        movdqu -32(%rsi,%rcx),%xmm1
        movdqu -48(%rsi,%rcx),%xmm2
        movdqu -64(%rsi,%rcx),%xmm3

        movdqa %xmm0,-16(%rdi,%rcx)
        movdqa %xmm1,-32(%rdi,%rcx)
        movdqa %xmm2,-48(%rdi,%rcx)
        movdqa %xmm3,-64(%rdi,%rcx)

        subq $64,%rcx
        jne LReverseUnalignedLoop

        jmp LReverseShort // copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)