/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
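
// Rough shape of the dispatch below (an explanatory sketch, not additional code):
//      length <  kShort                     -> simple doubleword-and-byte loop (Lshort)
//      kShort <= length < kVeryLong         -> aligned SSE loops, selected by source alignment
//      length >= kVeryLong                  -> call the commpage longcopy routine (non-temporal stores)
//      8/16-byte aligned and >= kFastUCode  -> "rep/movsl" microcode fastpath instead of SSE
// Overlapping operands with (dest - source) < length are handled by the reverse loops at the end.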
// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse3x, 32, 5)
        pushl %ebp // set up a frame for backtraces
        movl 8(%ebp),%esi // get source ptr
        movl 12(%ebp),%edi // get dest ptr
        movl 16(%ebp),%ecx // get length
        subl %esi,%edx // (dest - source)
        cmpl %ecx,%edx // must move in reverse if (dest - source) < length
        cmpl $(kShort),%ecx // long enough to bother with SSE?
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove: // void *memmove(void *dst, const void *src, size_t len)
        pushl %ebp // set up a frame for backtraces
        movl 8(%ebp),%edi // get dest ptr
        movl 12(%ebp),%esi // get source ptr
        movl 16(%ebp),%ecx // get length
        subl %esi,%edx // (dest - source)
        cmpl %ecx,%edx // must move in reverse if (dest - source) < length
        cmpl $(kShort),%ecx // long enough to bother with SSE?

// Handle short forward copies. As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
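//
// In C terms, the short path below is roughly (an explanatory sketch only):
//      words = len >> 2;  while (words--) { *(uint32_t *)d = *(uint32_t *)s; d += 4; s += 4; }
//      len  &= 3;         while (len--)   { *d++ = *s++; }
// i.e. copy whole doublewords first, then hand the 0..3 leftover bytes to LLeftovers.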
Lshort:
        movl %ecx,%edx // copy length
        shrl $2,%ecx // get #doublewords
2: // loop copying doublewords

LLeftovers: // handle leftover bytes (0..3) in last word
        andl $3,%edx // any leftover bytes?
4: // loop copying bytes

        movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove

LReverseIsland: // keep the "jb" above a short branch...
        jmp LReverse // ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
LNotShort:
        cmpl $(kVeryLong),%ecx // long enough to justify heavyweight loops?
        movl %edi,%edx // copy destination
        jae LVeryLong // use very-long-operand path

        andl $15,%edx // get #bytes to align destination
        jz LDestAligned // already aligned
        subl %edx,%ecx // decrement length
1: // loop copying 1..15 bytes

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
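//
// Note on the addressing scheme (explanatory only): because %edx holds the negated byte
// count, (%esi,%edx) and (%edi,%edx) address the first byte still to be copied, and the
// chunk loops simply count %edx up toward zero, 64 bytes per iteration, with no separate
// pointer or counter updates.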
LDestAligned:
        movl %ecx,%edx // copy length
        movl %esi,%eax // copy source address
        andl $63,%ecx // get remaining bytes for Lshort
        andl $-64,%edx // get number of bytes we will copy in inner loop
        andl $15,%eax // mask to low 4 bits of source address
        addl %edx,%esi // point to 1st byte not copied
        negl %edx // now generate offset to 1st byte to be copied
        .set LTableOffset, LTable - LZero
        leal (LTableOffset)(,%eax,4), %eax // compute jump table entry offset, relative to LZero
        movl _COMM_PAGE_BCOPY(%eax), %eax // load jump table entry
        addl $(_COMM_PAGE_BCOPY), %eax // add runtime address of LZero to get final function address
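
// How the computed jump works (explanatory only): the table at LTable holds each copy
// loop's offset from LZero; adding _COMM_PAGE_BCOPY (the commpage address this code runs
// at, i.e. the runtime address of LZero) converts the offset selected by the low 4 bits
// of the source address into an absolute address for the indirect jump into that loop.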
LTable: // table of copy loop addresses
// force generation of assembly-time constants. Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
        .set LMod0Offset, LMod0 - LZero
        .set LMod1Offset, LMod1 - LZero
        .set LMod2Offset, LMod2 - LZero
        .set LMod3Offset, LMod3 - LZero
        .set LMod4Offset, LMod4 - LZero
        .set LMod5Offset, LMod5 - LZero
        .set LMod6Offset, LMod6 - LZero
        .set LMod7Offset, LMod7 - LZero
        .set LMod8Offset, LMod8 - LZero
        .set LMod9Offset, LMod9 - LZero
        .set LMod10Offset, LMod10 - LZero
        .set LMod11Offset, LMod11 - LZero
        .set LMod12Offset, LMod12 - LZero
        .set LMod13Offset, LMod13 - LZero
        .set LMod14Offset, LMod14 - LZero
        .set LMod15Offset, LMod15 - LZero
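
// Why offsets instead of absolute addresses (explanatory note): this code is copied into
// the commpage and executes at _COMM_PAGE_BCOPY rather than at its link-time address, so
// the table stores position-independent offsets from LZero and the dispatch code above
// adds the commpage base at runtime.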
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarking. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.
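//
// Calling convention note (explanatory only): the arguments are pushed right to left per
// the i386 C ABI (length, then source, then dest), the call goes through the longcopy
// routine's commpage address, and the caller pops the 12 bytes of arguments afterward.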
LVeryLong:
        pushl %ecx // length (>= kVeryLong)
        pushl %esi // source ptr
        pushl %edi // dest ptr
        movl $(_COMM_PAGE_LONGCOPY),%eax
        call *%eax // do the long copy
        addl $12,%esp // pop off our parameters

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
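//
// What the fastpath below does (explanatory only): it rebuilds the total number of bytes
// still to move from the loop registers, lets "rep/movsl" move whole doublewords, and
// then falls into LLeftovers for the 0..3 trailing bytes.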
Lfastpath:
        addl %edx,%esi // restore ptrs to 1st byte of source and dest
        negl %edx // make length positive
        orl %edx,%ecx // restore total #bytes remaining to move (OR == ADD: %edx is a multiple of 64, %ecx < 64)
        cld // we'll move forward
        movl %ecx,%edx // copy total length to move
        shrl $2,%ecx // compute #words to move
        rep // the u-code will optimize this
        movsl // move the doublewords
        jmp LLeftovers // handle 0..3 leftover bytes
// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong)
        jle Lfastpath // long enough for fastpath in microcode
        .align 4,0x90 // 16-byte align inner loops
1: // loop over 64-byte chunks
        movdqa (%esi,%edx),%xmm0
        movdqa 16(%esi,%edx),%xmm1
        movdqa 32(%esi,%edx),%xmm2
        movdqa 48(%esi,%edx),%xmm3

        movdqa %xmm0,(%edi,%edx)
        movdqa %xmm1,16(%edi,%edx)
        movdqa %xmm2,32(%edi,%edx)
        movdqa %xmm3,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0001
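//
// How the palignr scheme works (explanatory note): the source is 1 byte past a 16-byte
// boundary, so the aligned loads at offsets -1, 15, 31, 47, 63 fetch the wanted data
// shifted by one byte. "palignr $1,%xmmA,%xmmB" concatenates %xmmB:%xmmA and shifts the
// 32-byte value right by 1 byte, leaving the 16 wanted source bytes in %xmmB. For
// example, if the vector loaded at -1 holds source bytes [-1..14] and the one at 15
// holds [15..30], palignr $1 of the pair yields bytes [0..15], exactly the first aligned
// store. %xmm0 (primed before the loop) carries the trailing vector from one iteration
// to the next. The other palignr loops (LMod2, LMod3, LMod5..LMod7, LMod9..LMod11,
// LMod13..LMod15) differ only in the shift count and load offsets.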
LMod1:
        movdqa -1(%esi,%edx),%xmm0 // prime the loop by loading 1st quadword
1: // loop over 64-byte chunks
        movdqa 15(%esi,%edx),%xmm1
        movdqa 31(%esi,%edx),%xmm2
        movdqa 47(%esi,%edx),%xmm3
        movdqa 63(%esi,%edx),%xmm4

        palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa -2(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 14(%esi,%edx),%xmm1
        movdqa 30(%esi,%edx),%xmm2
        movdqa 46(%esi,%edx),%xmm3
        movdqa 62(%esi,%edx),%xmm4

        palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa -3(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 13(%esi,%edx),%xmm1
        movdqa 29(%esi,%edx),%xmm2
        movdqa 45(%esi,%edx),%xmm3
        movdqa 61(%esi,%edx),%xmm4

        palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the "float" (single-precision) data type in order to use "movss" to merge vectors.
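//
// How the movss/pshufd scheme works (explanatory note): the source is 4-byte aligned, so
// each aligned load is one dword off. "movss %xmmB,%xmmA" replaces the low dword of
// %xmmA (the unwanted one) with the low dword of %xmmB, the first dword of the next
// aligned vector, and "pshufd $0x39" then rotates the four dwords into ascending order,
// producing the 16 unaligned source bytes needed for each aligned store.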
LMod4:
        movaps -4(%esi,%edx),%xmm0 // 4-byte aligned: prime the loop
1: // loop over 64-byte chunks
        movaps 12(%esi,%edx),%xmm1
        movaps 28(%esi,%edx),%xmm2
        movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
        pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps 44(%esi,%edx),%xmm3
        pshufd $(0x39),%xmm1,%xmm1
        movaps 60(%esi,%edx),%xmm4
        pshufd $(0x39),%xmm2,%xmm2

        movaps %xmm0,(%edi,%edx)
        pshufd $(0x39),%xmm3,%xmm3
        movaps %xmm1,16(%edi,%edx)
        movaps %xmm2,32(%edi,%edx)
        movaps %xmm3,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa -5(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 11(%esi,%edx),%xmm1
        movdqa 27(%esi,%edx),%xmm2
        movdqa 43(%esi,%edx),%xmm3
        movdqa 59(%esi,%edx),%xmm4

        palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa -6(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 10(%esi,%edx),%xmm1
        movdqa 26(%esi,%edx),%xmm2
        movdqa 42(%esi,%edx),%xmm3
        movdqa 58(%esi,%edx),%xmm4

        palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa -7(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 9(%esi,%edx),%xmm1
        movdqa 25(%esi,%edx),%xmm2
        movdqa 41(%esi,%edx),%xmm3
        movdqa 57(%esi,%edx),%xmm4

        palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the "double" (double-precision) data type in order to use "shufpd" to shift by 8 bytes.
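//
// How the shufpd scheme works (explanatory note): the source is 8-byte aligned, so each
// 16-byte store needs the high quadword of one aligned vector followed by the low
// quadword of the next. "shufpd $01,%xmmB,%xmmA" performs that merge: the high quadword
// of %xmmA moves into the low half and the low quadword of %xmmB into the high half.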
LMod8:
        cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong)
        jle Lfastpath // long enough for fastpath in microcode
        movapd -8(%esi,%edx),%xmm0 // 8-byte aligned: prime the loop
1: // loop over 64-byte chunks
        movapd 8(%esi,%edx),%xmm1
        movapd 24(%esi,%edx),%xmm2
        shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd 40(%esi,%edx),%xmm3
        shufpd $01,%xmm2,%xmm1
        movapd 56(%esi,%edx),%xmm4
        shufpd $01,%xmm3,%xmm2

        movapd %xmm0,(%edi,%edx)
        shufpd $01,%xmm4,%xmm3
        movapd %xmm1,16(%edi,%edx)
        movapd %xmm2,32(%edi,%edx)
        movapd %xmm3,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa -9(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 7(%esi,%edx),%xmm1
        movdqa 23(%esi,%edx),%xmm2
        movdqa 39(%esi,%edx),%xmm3
        movdqa 55(%esi,%edx),%xmm4

        palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa -10(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 6(%esi,%edx),%xmm1
        movdqa 22(%esi,%edx),%xmm2
        movdqa 38(%esi,%edx),%xmm3
        movdqa 54(%esi,%edx),%xmm4

        palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa -11(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 5(%esi,%edx),%xmm1
        movdqa 21(%esi,%edx),%xmm2
        movdqa 37(%esi,%edx),%xmm3
        movdqa 53(%esi,%edx),%xmm4

        palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the "float" (single-precision) data type in order to use "movss" to merge vectors.
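//
// How this variant works (explanatory note): the source is 4 bytes short of 16-byte
// alignment, so 4(%esi,%edx) is 16-byte aligned. "pshufd $0x93" rotates each aligned
// vector so its first three dwords move up into positions 1..3 (in order) while its last
// dword wraps around to position 0; that wrapped dword belongs at the bottom of the NEXT
// store, so the "movss" instructions pass each register's low dword along to the
// following register, completing each 16-byte destination block.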
LMod12:
        movss (%esi,%edx),%xmm0 // prefetch 1st four bytes of source, right justified
1: // loop over 64-byte chunks
        pshufd $(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd $(0x93),20(%esi,%edx),%xmm2
        pshufd $(0x93),36(%esi,%edx),%xmm3
        pshufd $(0x93),52(%esi,%edx),%xmm4

        movss %xmm3,%xmm4 // copy low 4 bytes of source into destination

        movaps %xmm1,(%edi,%edx)
        movaps %xmm2,16(%edi,%edx)
        movaps %xmm3,32(%edi,%edx)
        movaps %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa -13(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 3(%esi,%edx),%xmm1
        movdqa 19(%esi,%edx),%xmm2
        movdqa 35(%esi,%edx),%xmm3
        movdqa 51(%esi,%edx),%xmm4

        palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa -14(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 2(%esi,%edx),%xmm1
        movdqa 18(%esi,%edx),%xmm2
        movdqa 34(%esi,%edx),%xmm3
        movdqa 50(%esi,%edx),%xmm4

        palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa -15(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
        movdqa 1(%esi,%edx),%xmm1
        movdqa 17(%esi,%edx),%xmm2
        movdqa 33(%esi,%edx),%xmm3
        movdqa 49(%esi,%edx),%xmm4

        palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa %xmm1,(%edi,%edx)
        movdqa %xmm2,16(%edi,%edx)
        movdqa %xmm3,32(%edi,%edx)
        movdqa %xmm4,48(%edi,%edx)

        jmp Lshort // copy remaining 0..63 bytes and done

// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
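//
// Why a separate reverse path (explanatory note): when the destination overlaps the top
// of the source ((dest - source) < length), copying front to back would overwrite source
// bytes before they are read, so these loops start at the high ends of both buffers and
// walk downward. Register setup mirrors the forward case, with %esi/%edi pointing one
// byte past the ends and %edx again used as a chunk offset, here positive and decreasing.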
LReverse:
        addl %ecx,%esi // point to end of strings
        cmpl $(kShort),%ecx // long enough to bother with SSE?
        ja LReverseNotShort // yes

// Handle reverse short copies.
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl %ecx,%edx // copy length
        shrl $2,%ecx // #words
        andl $3,%edx // bytes?
        movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//      esi = one byte past end of source
//      edi = one byte past end of dest
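//
// The approach mirrors the forward SSE path (explanatory note): first copy 1..15 single
// bytes backward so the destination end becomes 16-byte aligned, then move 64-byte chunks
// downward, using aligned loads when the source end happens to be 16-byte aligned too and
// unaligned (movdqu) loads otherwise; the 0..63 byte tail is finished by LReverseShort.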
LReverseNotShort:
        movl %edi,%edx // copy destination
        andl $15,%edx // get #bytes to align destination
        je LReverseDestAligned // already aligned
        subl %edx,%ecx // adjust length
1: // loop copying 1..15 bytes

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl %ecx,%edx // copy length
        andl $63,%ecx // get remaining bytes for LReverseShort
        andl $-64,%edx // get number of bytes we will copy in inner loop
        subl %edx,%esi // point to endpoint of copy
        testl $15,%esi // is source aligned too?
        jnz LReverseUnalignedLoop // no

LReverseAlignedLoop: // loop over 64-byte chunks
        movdqa -16(%esi,%edx),%xmm0
        movdqa -32(%esi,%edx),%xmm1
        movdqa -48(%esi,%edx),%xmm2
        movdqa -64(%esi,%edx),%xmm3

        movdqa %xmm0,-16(%edi,%edx)
        movdqa %xmm1,-32(%edi,%edx)
        movdqa %xmm2,-48(%edi,%edx)
        movdqa %xmm3,-64(%edi,%edx)

        jne LReverseAlignedLoop

        jmp LReverseShort // copy remaining 0..63 bytes and done

// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop: // loop over 64-byte chunks
        movdqu -16(%esi,%edx),%xmm0
        movdqu -32(%esi,%edx),%xmm1
        movdqu -48(%esi,%edx),%xmm2
        movdqu -64(%esi,%edx),%xmm3

        movdqa %xmm0,-16(%edi,%edx)
        movdqa %xmm1,-32(%edi,%edx)
        movdqa %xmm2,-48(%edi,%edx)
        movdqa %xmm3,-64(%edi,%edx)

        jne LReverseUnalignedLoop

        jmp LReverseShort // copy remaining 0..63 bytes and done

COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)