/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
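
// Roughly, these thresholds drive the following dispatch (illustrative sketch,
// names and conditions paraphrased from the code below):
//
//     if (len <= kShort)          copy 4-byte words, then 0..3 leftover bytes
//     else if (len >= kVeryLong)  call the separate long-copy routine
//     else if (len >= kFastUCode && src/dst are mutually 8-byte aligned)
//                                 use the "rep/movsl" microcode fastpath
//     else                        run one of the 16 SSE loops, chosen by the
//                                 low 4 bits of the source address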

// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse3x, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?

// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);

PLATFUNC_FUNCTION_START(memcpy, sse3x, 32, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse3x, 32, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
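//
// Roughly equivalent C for this short path (illustrative sketch):
//
//     uint32_t *s = (uint32_t *)src, *d = (uint32_t *)dst;
//     for (size_t i = 0; i < len / 4; i++) *d++ = *s++;          // doublewords
//     for (size_t i = 0; i < (len & 3); i++)                     // 0..3 leftovers
//         ((char *)d)[i] = ((char *)s)[i];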

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
2:                                      // loop copying doublewords
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
4:                                      // loop copying bytes
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove

LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)

        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
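//
// The dispatch itself is roughly (illustrative sketch):
//
//     index = src & 15;                    // mutual misalignment of source vs.
//                                          // the now 16-byte-aligned destination
//     goto *(&LTable + LTable[index]);     // entries appear to be loop offsets
//                                          // relative to LTable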

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (LTable-1b)(%ebx,%eax,4),%eax   // load jump table entry address, relative to LZero
        leal    (LTable-1b)(%ebx,%eax,1),%eax   // convert to absolute address of the copy loop

LTable:                                 // table of copy loop addresses

// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the platfunc for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.
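//
// Under the normal (cdecl) ABI, arguments are pushed right to left, so pushing
// length, then source, then dest hands the long-copy routine a memcpy-style
// (dest, source, length) argument list.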

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        addl    $12,%esp                // pop off our parameters

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
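//
// "rep/movsl" itself is just the word-granularity string move; with the
// direction flag clear it is conceptually (illustrative sketch):
//
//     while (ecx--) { *(uint32_t *)edi = *(uint32_t *)esi; esi += 4; edi += 4; }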

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes

// Forward loop for medium length operands in which low four bits of %esi == 0000

        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode

        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0001
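//
// Technique used here and in the other misaligned cases: each aligned 16-byte
// load fetches the block that straddles the bytes we want, and "palignr $N"
// concatenates two adjacent blocks (the destination register supplies the high
// half), shifts the pair right by N bytes, and leaves 16 contiguous source
// bytes that can then be stored with an aligned write.  Roughly:
//
//     xmm_dst = low128( (xmm_dst : xmm_src) >> (N*8) )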

        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0010

        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0011

        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
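//
// With a 4-byte-aligned source, "movss" replaces the stale low dword of the
// previous vector with the low dword of the next one, and "pshufd $0x39" then
// rotates the dwords right by one position; together they behave like a
// 4-byte palignr.  Roughly: {a,b,c,d} + e  ->  {e,b,c,d}  ->  {b,c,d,e}.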

        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop

1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm3,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0101

        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0110

        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0111

        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
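//
// Here "shufpd $01" builds each output vector from the high 8 bytes of one
// aligned load and the low 8 bytes of the next, i.e. it plays the role of an
// 8-byte palignr for the half-aligned case.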

        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop

1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm3,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1001

        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1010

        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1011

        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
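//
// This is the mirror image of the "== 0100" case: "pshufd $0x93" rotates each
// loaded vector right by 12 bytes (i.e. left by one dword), and "movss" then
// patches in the one dword that belongs to the neighboring load.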

        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified

1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1      // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1101

        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1110

        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1111

        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
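//
// Overlap with the destination above the source is the one case a forward copy
// would corrupt, so the copy runs from the end of both buffers toward the
// beginning; roughly (illustrative sketch):
//
//     for (size_t i = len; i-- > 0; ) dst[i] = src[i];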

LReverse:
        addl    %ecx,%esi               // point to end of strings
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        andl    $3,%edx                 // bytes?
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)