/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <platfunc.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"

// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse3x, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

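// A rough C-level sketch of the dispatch performed above (and repeated below
// for memcpy/memmove); the helper names in comments are illustrative only.
// One unsigned compare covers both directions, because the subtraction wraps
// modulo 2^32 whenever dst is below src:
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//          ... copy backward (LReverse) ...        // ranges overlap, dst above src
//      else if (len <= kShort)
//          ... copy with scalar loops (Lshort) ...
//      else
//          ... copy with SSE loops (LNotShort) ...
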
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse3x, 32, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse3x, 32, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr
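//
// Roughly equivalent C, for reference only (the numeric labels refer to the
// loops below):
//
//      while (len >= 4) {                              // doubleword loop (2:)
//          *(uint32_t *)dst = *(const uint32_t *)src;
//          dst += 4;  src += 4;  len -= 4;
//      }
//      while (len--)                                   // byte loop (LLeftovers / 4:)
//          *dst++ = *src++;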

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr
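//
// The alignment step below amounts to the following (illustrative C only):
//
//      pad = (-(uintptr_t)dst) & 15;   // 0..15 bytes up to the next 16-byte boundary
//      len -= pad;                     // copy those bytes one at a time, then vectorize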

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
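//
// The dispatch below is position independent: "call 1f / popl %ebx" puts the
// address of label 1 in %ebx, and each LTable entry stores its loop's offset
// relative to LTable itself.  Roughly (illustrative pseudocode only):
//
//      int32_t *table = &LTable;                       // %ebx + (LTable - 1b)
//      target = (char *)table + table[src & 15];       // entry N holds (LModN - LTable)
//      goto *target;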

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        call    1f                      // get our own address for PIC-style addressing
1:
        popl    %ebx                    // %ebx = address of 1b
        movl    (LTable-1b)(%ebx,%eax,4), %eax  // load jump table entry (offset of loop relative to LTable)
        leal    (LTable-1b)(%ebx,%eax,1), %eax  // convert the offset into the loop's absolute address
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0  - LTable
        .long   LMod1  - LTable
        .long   LMod2  - LTable
        .long   LMod3  - LTable
        .long   LMod4  - LTable
        .long   LMod5  - LTable
        .long   LMod6  - LTable
        .long   LMod7  - LTable
        .long   LMod8  - LTable
        .long   LMod9  - LTable
        .long   LMod10 - LTable
        .long   LMod11 - LTable
        .long   LMod12 - LTable
        .long   LMod13 - LTable
        .long   LMod14 - LTable
        .long   LMod15 - LTable


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the platfunc for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.
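// (Since this is a normal cdecl call, the three pushes below mean _longcopy
// receives its arguments as (dst, src, len), the same order as memcpy.)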

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        call    _longcopy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move (edx is a multiple of 64, ecx is 0..63, so OR adds them)
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001
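//
// All the LMod1..LMod15 loops below follow the same pattern, with only the
// byte offsets and the palignr shift count changing.  Each 64-byte iteration
// carries the previous trailing source vector forward in %xmm0 so that every
// load and store stays 16-byte aligned.  "palignr $N,%xmmLo,%xmmHi"
// concatenates (%xmmHi:%xmmLo), shifts the 32-byte value right by N bytes, and
// keeps the low 16 bytes; in LMod1, for instance, %xmm5 holds source bytes
// -1..14 and %xmm1 holds bytes 15..30, so "palignr $1,%xmm5,%xmm1" produces
// bytes 0..15 -- the next aligned destination vector.  (The byte numbering is
// illustrative, relative to the current chunk.)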

LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
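// Sketch of one merge (illustrative byte numbers, relative to the chunk):
// %xmm0 holds source bytes -4..11 and %xmm1 holds bytes 12..27, both from
// aligned loads.  "movss %xmm1,%xmm0" overwrites the stale low doubleword
// (bytes -4..-1) with bytes 12..15, and "pshufd $0x39" rotates the four
// doublewords right by one, leaving %xmm0 with bytes 0..15 in order.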

LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
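// Sketch (illustrative byte numbers): with %xmm0 holding source bytes -8..7
// and %xmm1 holding bytes 8..23, "shufpd $1,%xmm1,%xmm0" keeps the high
// quadword of %xmm0 and the low quadword of %xmm1, i.e. bytes 0..15 -- an
// 8-byte shift of the concatenated pair.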

LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm1 || %xmm0, 8 bytes )
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
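// Sketch (illustrative): each 16-byte load below is aligned (source offset +4
// reaches a 16-byte boundary) and is pre-rotated with "pshufd $0x93" so that
// its highest doubleword lands in the low slot; "movss" then replaces that
// slot with the doubleword carried over from the previous vector, yielding 16
// destination bytes in order.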

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr
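//
// Rough shape of the reverse path (for orientation only): both pointers are
// first advanced to one past the end of their buffers, the tail of the
// destination is brought to a 16-byte boundary with a byte loop, and the main
// loops then walk %edx down from the chunked byte count to zero using
// negative offsets from the adjusted pointers.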

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for LReverseShort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)