/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort		80			// too short to bother with SSE (must be >=80)
#define kVeryLong	(500*1024)		// large enough for non-temporal stores (must be >= 8192)
#define kFastUCode	((16*1024)-15)		// cutoff for microcode fastpath for "rep/movsl"
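
// For orientation, a rough C sketch of the size dispatch these constants drive
// (illustrative only, not assembled; the helper names are invented):
//
//	void *sketch_memcpy(void *dst, const void *src, size_t len) {
//	    if ((size_t)((char *)dst - (const char *)src) < len)
//	        return copy_backwards(dst, src, len);        // destructive overlap -> LReverse
//	    if (len <= kShort)
//	        return copy_words_then_bytes(dst, src, len); // simple doubleword loop
//	    if (len >= kVeryLong)
//	        return longcopy(dst, src, len);              // non-temporal path, off the commpage
//	    return copy_64byte_chunks(dst, src, len);        // SSE loops LMod0..LMod15 below
//	}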
// void bcopy(const void *src, void *dst, size_t len);

Lbcopy_sse4:				// void bcopy(const void *src, void *dst, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	8(%ebp),%esi		// get source ptr
	movl	12(%ebp),%edi		// get dest ptr
	movl	16(%ebp),%ecx		// get length
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	8(%ebp),%edi		// get dest ptr
	movl	12(%ebp),%esi		// get source ptr
	movl	16(%ebp),%ecx		// get length
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?

// Handle short forward copies. As the most common case, this is the fall-through path.
//	ecx = length (<= kShort)
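
// Roughly, in C (sketch only; x86 tolerates the unaligned 32-bit accesses):
//
//	uint32_t       *d = (uint32_t *)dst;
//	const uint32_t *s = (const uint32_t *)src;
//	for (size_t i = 0; i < len/4; i++)			// the doubleword loop at "2:"
//	    d[i] = s[i];
//	for (size_t i = len & ~(size_t)3; i < len; i++)		// 0..3 trailing bytes, LLeftovers
//	    ((char *)dst)[i] = ((const char *)src)[i];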
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
2:					// loop copying doublewords

LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
4:					// loop copying bytes

	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove

LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//	ecx = length (> kShort)

	cmpl	$(kVeryLong),%ecx	// long enough to justify heavyweight loops?
	movl	%edi,%edx		// copy destination
	jae	LVeryLong		// use very-long-operand path
	andl	$15,%edx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%edx,%ecx		// decrement length
1:					// loop copying 1..15 bytes

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are valid:
//	ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//	esi = ptr to 1st source byte not to move (unaligned)
//	edi = ptr to 1st dest byte not to move (aligned)
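
// In C terms, the dispatch that follows does roughly this (sketch only; names invented):
//
//	size_t chunk_bytes = len & ~(size_t)63;			// moved by the 64-byte inner loop
//	size_t residual    = len & 63;				// finished later by Lshort
//	copy_64byte_chunks[(uintptr_t)src & 15](dst, src, chunk_bytes);	// one of LMod0..LMod15 via LTable
//
// Each loop then runs %edx from -chunk_bytes up to zero, indexing off pointers that
// already sit just past the chunked region, so the loop condition is simply
// "has %edx reached zero".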
	movl	%ecx,%edx		// copy length
	movl	%esi,%eax		// copy source address
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	andl	$15,%eax		// mask to low 4 bits of source address
	addl	%edx,%esi		// point to 1st byte not copied
	negl	%edx			// now generate offset to 1st byte to be copied
	movl	(_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
LTable:					// table of copy loop addresses
	.long	LMod0  + _COMM_PAGE_BCOPY - LZero
	.long	LMod1  + _COMM_PAGE_BCOPY - LZero
	.long	LMod2  + _COMM_PAGE_BCOPY - LZero
	.long	LMod3  + _COMM_PAGE_BCOPY - LZero
	.long	LMod4  + _COMM_PAGE_BCOPY - LZero
	.long	LMod5  + _COMM_PAGE_BCOPY - LZero
	.long	LMod6  + _COMM_PAGE_BCOPY - LZero
	.long	LMod7  + _COMM_PAGE_BCOPY - LZero
	.long	LMod8  + _COMM_PAGE_BCOPY - LZero
	.long	LMod9  + _COMM_PAGE_BCOPY - LZero
	.long	LMod10 + _COMM_PAGE_BCOPY - LZero
	.long	LMod11 + _COMM_PAGE_BCOPY - LZero
	.long	LMod12 + _COMM_PAGE_BCOPY - LZero
	.long	LMod13 + _COMM_PAGE_BCOPY - LZero
	.long	LMod14 + _COMM_PAGE_BCOPY - LZero
	.long	LMod15 + _COMM_PAGE_BCOPY - LZero
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.
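
// In effect this path just makes an ordinary cdecl call, roughly
//
//	longcopy(dst, src, len);	// args pushed right to left, caller pops 12 bytes
//
// with the callee reached through its fixed commpage address (_COMM_PAGE_LONGCOPY).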
	pushl	%ecx			// length (>= kVeryLong)
	pushl	%esi			// source ptr
	pushl	%edi			// dest ptr
	movl	$(_COMM_PAGE_LONGCOPY),%eax
	call	*%eax			// do the long copy
	addl	$12,%esp		// pop off our parameters
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
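
// What the fastpath amounts to, as a C sketch (not assembled; "residual" and
// "chunk_bytes" are the two values recombined by the orl below):
//
//	size_t n = residual | chunk_bytes;		// total bytes still to move
//	for (size_t i = 0; i < n/4; i++)		// what "rep/movsl" does in microcode
//	    ((uint32_t *)dst)[i] = ((const uint32_t *)src)[i];
//	/* LLeftovers then copies the final n & 3 bytes */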
	addl	%edx,%esi		// restore ptrs to 1st byte of source and dest
	negl	%edx			// make length positive
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	movl	%ecx,%edx		// copy total length to move
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	jmp	LLeftovers		// handle 0..3 leftover bytes
// Forward loop for medium length operands in which low four bits of %esi == 0000

	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
	movdqa	(%esi,%edx),%xmm0
	movdqa	16(%esi,%edx),%xmm1
	movdqa	32(%esi,%edx),%xmm2
	movdqa	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0001

	movdqa	-1(%esi,%edx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
	movdqa	15(%esi,%edx),%xmm1
	movdqa	31(%esi,%edx),%xmm2
	movdqa	47(%esi,%edx),%xmm3
	movdqa	63(%esi,%edx),%xmm4

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
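
// For reference, one 64-byte chunk of the shift-and-merge scheme above, written with
// SSSE3 intrinsics (illustrative sketch, not part of this file; src is a char pointer
// one byte past a 16-byte boundary, dst is 16-byte aligned, prev was primed from src-1):
//
//	#include <tmmintrin.h>
//	__m128i v0 = _mm_load_si128((const __m128i *)(src + 15));
//	__m128i v1 = _mm_load_si128((const __m128i *)(src + 31));
//	__m128i v2 = _mm_load_si128((const __m128i *)(src + 47));
//	__m128i v3 = _mm_load_si128((const __m128i *)(src + 63));
//	_mm_store_si128((__m128i *)(dst +  0), _mm_alignr_epi8(v0, prev, 1));	// src[0..15]
//	_mm_store_si128((__m128i *)(dst + 16), _mm_alignr_epi8(v1, v0, 1));
//	_mm_store_si128((__m128i *)(dst + 32), _mm_alignr_epi8(v2, v1, 1));
//	_mm_store_si128((__m128i *)(dst + 48), _mm_alignr_epi8(v3, v2, 1));
//	prev = v3;				// carried into the next chunk
//
// The assembly performs the merges from the last vector back to the first so each
// palignr still sees its neighbor's unshifted value.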
// Forward loop for medium length operands in which low four bits of %esi == 0010

	movdqa	-2(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	14(%esi,%edx),%xmm1
	movdqa	30(%esi,%edx),%xmm2
	movdqa	46(%esi,%edx),%xmm3
	movdqa	62(%esi,%edx),%xmm4

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0011

	movdqa	-3(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	13(%esi,%edx),%xmm1
	movdqa	29(%esi,%edx),%xmm2
	movdqa	45(%esi,%edx),%xmm3
	movdqa	61(%esi,%edx),%xmm4

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

	movaps	-4(%esi,%edx),%xmm0	// 4-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movaps	12(%esi,%edx),%xmm1
	movaps	28(%esi,%edx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
	movaps	44(%esi,%edx),%xmm3
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%esi,%edx),%xmm4
	pshufd	$(0x39),%xmm2,%xmm2

	movaps	%xmm0,(%edi,%edx)
	pshufd	$(0x39),%xmm3,%xmm3
	movaps	%xmm1,16(%edi,%edx)
	movaps	%xmm2,32(%edi,%edx)
	movaps	%xmm3,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
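
// The 4-byte-offset case merges with movss and then rotates with pshufd. Per 16 bytes
// it is roughly the following (intrinsics sketch, not part of this file; src/dst are
// float pointers, src sitting 4 bytes past a 16-byte boundary, prev primed from the
// 16 aligned bytes starting 4 bytes before src):
//
//	#include <xmmintrin.h>
//	__m128 next   = _mm_load_ps(src + 3);			// source bytes 12..27
//	__m128 merged = _mm_move_ss(prev, next);		// low 4 bytes <- bytes 12..15
//	merged = _mm_shuffle_ps(merged, merged, 0x39);		// rotate: now bytes 0..15, in order
//	_mm_store_ps(dst, merged);
//	prev = next;						// and so on, 16 bytes at a time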
// Forward loop for medium length operands in which low four bits of %esi == 0101

	movdqa	-5(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	11(%esi,%edx),%xmm1
	movdqa	27(%esi,%edx),%xmm2
	movdqa	43(%esi,%edx),%xmm3
	movdqa	59(%esi,%edx),%xmm4

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0110

	movdqa	-6(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	10(%esi,%edx),%xmm1
	movdqa	26(%esi,%edx),%xmm2
	movdqa	42(%esi,%edx),%xmm3
	movdqa	58(%esi,%edx),%xmm4

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0111

	movdqa	-7(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	9(%esi,%edx),%xmm1
	movdqa	25(%esi,%edx),%xmm2
	movdqa	41(%esi,%edx),%xmm3
	movdqa	57(%esi,%edx),%xmm4

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%esi,%edx),%xmm0	// 8-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movapd	8(%esi,%edx),%xmm1
	movapd	24(%esi,%edx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
	movapd	40(%esi,%edx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%esi,%edx),%xmm4
	shufpd	$01,%xmm3,%xmm2

	movapd	%xmm0,(%edi,%edx)
	shufpd	$01,%xmm4,%xmm3
	movapd	%xmm1,16(%edi,%edx)
	movapd	%xmm2,32(%edi,%edx)
	movapd	%xmm3,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
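
// With 8-byte alignment a single shufpd splices each output vector. Per 16 bytes,
// roughly (intrinsics sketch, not part of this file; src/dst are double pointers,
// src sitting 8 bytes past a 16-byte boundary, prev primed from src-1):
//
//	#include <emmintrin.h>
//	__m128d next = _mm_load_pd(src + 1);			// source bytes 8..23
//	_mm_store_pd(dst, _mm_shuffle_pd(prev, next, 1));	// {bytes 0..7, bytes 8..15}
//	prev = next;						// continue through the chunk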
// Forward loop for medium length operands in which low four bits of %esi == 1001

	movdqa	-9(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	7(%esi,%edx),%xmm1
	movdqa	23(%esi,%edx),%xmm2
	movdqa	39(%esi,%edx),%xmm3
	movdqa	55(%esi,%edx),%xmm4

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1010

	movdqa	-10(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	6(%esi,%edx),%xmm1
	movdqa	22(%esi,%edx),%xmm2
	movdqa	38(%esi,%edx),%xmm3
	movdqa	54(%esi,%edx),%xmm4

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1011

	movdqa	-11(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	5(%esi,%edx),%xmm1
	movdqa	21(%esi,%edx),%xmm2
	movdqa	37(%esi,%edx),%xmm3
	movdqa	53(%esi,%edx),%xmm4

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

	movss	(%esi,%edx),%xmm0	// prefetch 1st four bytes of source, right justified
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%esi,%edx),%xmm2
	pshufd	$(0x93),36(%esi,%edx),%xmm3
	pshufd	$(0x93),52(%esi,%edx),%xmm4

	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination

	movaps	%xmm1,(%edi,%edx)
	movaps	%xmm2,16(%edi,%edx)
	movaps	%xmm3,32(%edi,%edx)
	movaps	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
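
// This is the mirror image of the 4-byte case: each 16-byte load is pre-rotated with
// pshufd (0x93) so its three useful doublewords land in elements 1..3, and movss then
// fills in the missing low doubleword carried over from the previous load (the first
// carry being the single doubleword prefetched above).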
// Forward loop for medium length operands in which low four bits of %esi == 1101

	movdqa	-13(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	3(%esi,%edx),%xmm1
	movdqa	19(%esi,%edx),%xmm2
	movdqa	35(%esi,%edx),%xmm3
	movdqa	51(%esi,%edx),%xmm4

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1110

	movdqa	-14(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	2(%esi,%edx),%xmm1
	movdqa	18(%esi,%edx),%xmm2
	movdqa	34(%esi,%edx),%xmm3
	movdqa	50(%esi,%edx),%xmm4

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1111

	movdqa	-15(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	1(%esi,%edx),%xmm1
	movdqa	17(%esi,%edx),%xmm2
	movdqa	33(%esi,%edx),%xmm3
	movdqa	49(%esi,%edx),%xmm4

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

	movdqa	%xmm1,(%edi,%edx)
	movdqa	%xmm2,16(%edi,%edx)
	movdqa	%xmm3,32(%edi,%edx)
	movdqa	%xmm4,48(%edi,%edx)

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
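
// The shape of a reverse (overlap-safe) copy, as a C sketch (illustrative only):
//
//	const char *s = (const char *)src + len;	// one past the end, as below
//	char       *d = (char *)dst + len;
//	while (len--)
//	    *--d = *--s;
//
// The code below does the same thing a doubleword at a time for short operands, and
// in 16-byte-aligned 64-byte chunks (falling back to the short path for the residue)
// for longer ones.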
	addl	%ecx,%esi		// point to end of strings
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	esi = one byte past end of source
//	edi = one byte past end of dest

	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// #words

	andl	$3,%edx			// bytes?

	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//	esi = one byte past end of source
//	edi = one byte past end of dest

	movl	%edi,%edx		// copy destination
	andl	$15,%edx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes

// Destination is now aligned. Prepare for reverse loops.

	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	subl	%edx,%esi		// point to endpoint of copy
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%esi,%edx),%xmm0
	movdqa	-32(%esi,%edx),%xmm1
	movdqa	-48(%esi,%edx),%xmm2
	movdqa	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%esi,%edx),%xmm0
	movdqu	-32(%esi,%edx),%xmm1
	movdqu	-48(%esi,%edx),%xmm2
	movdqu	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

	COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)