/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
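
// As a rough C sketch (the helper names below are placeholders, not real
// routines), these cutoffs partition the forward-copy dispatch like this:
//
//      if (n <= kShort)
//          copy_scalar(dst, src, n);           // the Lshort path
//      else if (n >= kVeryLong)
//          call_longcopy(dst, src, n);         // separate non-temporal routine
//      else if (mutually_8byte_aligned(src, dst) && n >= kFastUCode)
//          copy_rep_movsl(dst, src, n);        // microcode fastpath
//      else
//          copy_sse_chunks(dst, src, n);       // the sixteen LModN loops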

// void bcopy(const void *src, void *dst, size_t len);

LZero:
Lbcopy_sse4:                            // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes
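
// The unsigned compare above is the usual memmove direction test: if the
// destination starts inside the source buffer, a forward copy would clobber
// source bytes before reading them.  As a C sketch (copy_forward and
// copy_backward are hypothetical names for the paths below):
//
//      if ((uintptr_t)dst - (uintptr_t)src < n)    // wraps around if dst < src
//          copy_backward(dst, src, n);             // LReverse
//      else
//          copy_forward(dst, src, n);              // fall through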

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
2:                                      // loop copying doublewords
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
4:                                      // loop copying bytes
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
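
// A C sketch of the Lshort/LLeftovers path (illustrative only; it glosses
// over strict-aliasing formalities, which do not apply to the assembly):
//
//      static void copy_short_forward(unsigned char *dst, const unsigned char *src, size_t n)
//      {
//          size_t words = n >> 2;                      // #doublewords
//          while (words--) {
//              *(uint32_t *)dst = *(const uint32_t *)src;
//              dst += 4;  src += 4;
//          }
//          n &= 3;                                     // 0..3 leftover bytes
//          while (n--)
//              *dst++ = *src++;
//      }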

LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
        jmp     *%eax

LTable:                                 // table of copy loop addresses
        .long   LMod0  + _COMM_PAGE_BCOPY - LZero
        .long   LMod1  + _COMM_PAGE_BCOPY - LZero
        .long   LMod2  + _COMM_PAGE_BCOPY - LZero
        .long   LMod3  + _COMM_PAGE_BCOPY - LZero
        .long   LMod4  + _COMM_PAGE_BCOPY - LZero
        .long   LMod5  + _COMM_PAGE_BCOPY - LZero
        .long   LMod6  + _COMM_PAGE_BCOPY - LZero
        .long   LMod7  + _COMM_PAGE_BCOPY - LZero
        .long   LMod8  + _COMM_PAGE_BCOPY - LZero
        .long   LMod9  + _COMM_PAGE_BCOPY - LZero
        .long   LMod10 + _COMM_PAGE_BCOPY - LZero
        .long   LMod11 + _COMM_PAGE_BCOPY - LZero
        .long   LMod12 + _COMM_PAGE_BCOPY - LZero
        .long   LMod13 + _COMM_PAGE_BCOPY - LZero
        .long   LMod14 + _COMM_PAGE_BCOPY - LZero
        .long   LMod15 + _COMM_PAGE_BCOPY - LZero
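
// A sketch of the dispatch above, in C terms (mod_loop[] is a hypothetical
// stand-in for LTable): once the destination is 16-byte aligned, the low four
// bits of the still-unaligned source select a chunk loop, and the pointers
// and negative offset are set up so the loop counts up toward zero:
//
//      size_t residual = n & 63;                   // left over for Lshort
//      long   chunked  = (long)(n & ~(size_t)63);  // multiple of 64, at least one chunk
//      src += chunked;  dst += chunked;            // point past the chunked region
//      mod_loop[(uintptr_t)src & 15](dst, src, -chunked);
//      /* ...then fall into the Lshort path for the residual bytes... */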

// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes
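
// Roughly when the fastpath is used, as a C sketch: only the loops for
// mutually 8-byte aligned operands (LMod0 and LMod8 below) branch here, and
// only for lengths of at least kFastUCode (kVeryLong was filtered out above):
//
//      int use_rep_movsl = ((((uintptr_t)src - (uintptr_t)dst) & 7) == 0) &&
//                          (n >= kFastUCode) && (n < kVeryLong);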

// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0001

LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done
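
// The shift-and-merge idiom above, restated with SSSE3 intrinsics for clarity
// (illustrative only; palignr is _mm_alignr_epi8 from <tmmintrin.h>).  Two
// aligned loads and a byte shift reproduce one unaligned 16-byte load from
// p+1, where p is a 16-byte aligned byte pointer:
//
//      __m128i lo = _mm_load_si128((const __m128i *)p);            // bytes  0..15
//      __m128i hi = _mm_load_si128((const __m128i *)(p + 16));     // bytes 16..31
//      __m128i v  = _mm_alignr_epi8(hi, lo, 1);                    // bytes  1..16
//
// The loops keep the previous iteration's last vector live (the %xmm5/%xmm0
// shuffling) so each aligned load is consumed twice instead of being reloaded.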

// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        pshufd  $(0x39),%xmm2,%xmm2
        movaps  %xmm0,(%edi,%edx)
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm3,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done
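
// The movss/pshufd merge above, as an SSE intrinsics sketch (illustrative
// only).  For a source that is 4 bytes past a 16-byte boundary, each vector
// is rebuilt from two aligned float loads, where p is the aligned float ptr:
//
//      __m128 lo = _mm_load_ps(p);                 // p[0..3], bytes  0..15
//      __m128 hi = _mm_load_ps(p + 4);             // p[4..7], bytes 16..31
//      __m128 t  = _mm_move_ss(lo, hi);            // { p[4], p[1], p[2], p[3] }
//      __m128 v  = _mm_shuffle_ps(t, t, 0x39);     // { p[1], p[2], p[3], p[4] } = bytes 4..19
//
// movss can only merge into the low lane, hence the rotate-right afterwards.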

// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2
        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm3,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done
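
// The shufpd idiom above, as an SSE2 intrinsics sketch (illustrative only).
// For a source that is 8 bytes past a 16-byte boundary, each vector is the
// high half of one aligned load joined to the low half of the next, where p
// is the aligned double ptr:
//
//      __m128d lo = _mm_load_pd(p);                // p[0..1], bytes  0..15
//      __m128d hi = _mm_load_pd(p + 2);            // p[2..3], bytes 16..31
//      __m128d v  = _mm_shuffle_pd(lo, hi, 1);     // { p[1], p[2] } = bytes 8..23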

// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done
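
// The load-rotate-merge idiom above, as an SSE intrinsics sketch (illustrative
// only).  For a source that is 12 bytes past a 16-byte boundary, the next
// aligned vector is rotated so its low three floats land in the upper lanes,
// then the carried-over float is merged into lane 0 (p is the aligned float ptr):
//
//      __m128 carry = _mm_load_ss(p + 3);                  // { p[3], 0, 0, 0 }
//      __m128 next  = _mm_load_ps(p + 4);                  // p[4..7]
//      __m128 rot   = _mm_shuffle_ps(next, next, 0x93);    // { p[7], p[4], p[5], p[6] }
//      __m128 v     = _mm_move_ss(rot, carry);             // { p[3], p[4], p[5], p[6] } = bytes 12..27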

// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        jmp     Lshort                  // copy remaining 0..63 bytes and done

// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        andl    $3,%edx                 // bytes?
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//      ecx = length (> kShort)
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done
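
// The reverse path in C terms (illustrative only, byte-granular for brevity;
// the helper name is hypothetical): start at the ends and walk backwards so
// that, with destructive overlap, every byte is read before it is overwritten:
//
//      static void copy_backward(unsigned char *dst, const unsigned char *src, size_t n)
//      {
//          dst += n;  src += n;            // one byte past the ends
//          while (n--)
//              *--dst = *--src;
//      }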

        COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)