/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * SSE4 and 64-byte cache lines. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define	kShort		80		// too short to bother with SSE (must be >=80)
#define	kVeryLong	(500*1024)	// large enough for non-temporal stores (>=8192 and <2GB)
#define	kFastUCode	((16*1024)-15)	// cutoff for microcode fastpath for "rep/movsl"
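
/*
 * Illustrative sketch (not part of the original source): how the entry code
 * below chooses a strategy from these constants. The function and enum names
 * are hypothetical; the thresholds and the unsigned overlap test mirror the
 * assembly that follows.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	enum copy_path { PATH_REVERSE, PATH_SHORT, PATH_SSE, PATH_VERY_LONG };
 *
 *	static enum copy_path
 *	select_path(const void *dst, const void *src, size_t len)
 *	{
 *		// unsigned compare: true only when dst lies inside [src, src+len)
 *		if ((uintptr_t)dst - (uintptr_t)src < len)
 *			return PATH_REVERSE;	// destructive overlap: copy backwards
 *		if (len <= kShort)
 *			return PATH_SHORT;	// too short to bother with SSE
 *		if (len < kVeryLong)
 *			return PATH_SSE;	// medium: aligned SSE loops
 *		return PATH_VERY_LONG;		// huge: non-temporal longcopy path
 *	}
 */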
// void bcopy(const void *src, void *dst, size_t len);

Lbcopy_sse4_64:				// void bcopy(const void *src, void *dst, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// exchange source and dest ptrs
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?

// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rdi,%r11		// save return value here
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?

// Handle short forward copies. As the most common case, this is the fall-through path.
//	rdx = length (<= kShort)

	movl	%edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
2:					// loop copying doublewords

LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
4:					// loop copying bytes

	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
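
/*
 * Illustrative sketch (not part of the original source): the short forward
 * path above in C. short_forward_copy is a hypothetical name; like the
 * assembly, it moves 4-byte doublewords first, then the 0..3 leftover bytes,
 * and returns the destination pointer for memcpy/memmove. (The unaligned
 * 32-bit accesses are fine on x86; portable C would copy byte by byte.)
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void *
 *	short_forward_copy(void *dst, const void *src, size_t len)
 *	{
 *		uint32_t *d = (uint32_t *)dst;
 *		const uint32_t *s = (const uint32_t *)src;
 *		size_t words = len >> 2;		// get #doublewords
 *
 *		while (words--)				// loop copying doublewords
 *			*d++ = *s++;
 *
 *		unsigned char *db = (unsigned char *)d;
 *		const unsigned char *sb = (const unsigned char *)s;
 *		len &= 3;				// leftover bytes (0..3) in last word
 *		while (len--)				// loop copying bytes
 *			*db++ = *sb++;
 *
 *		return dst;				// memcpy/memmove return the dst ptr
 *	}
 */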
LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	rdx = length (> kShort)

	cmpq	$(kVeryLong),%rdx	// long enough to justify heavyweight loops?
	jae	LVeryLong		// use very-long-operand path
	movl	%edi,%ecx		// copy low half of destination ptr
	andl	$15,%ecx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length
	rep				// align destination

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following
// registers are set up:
//	rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//	rsi = ptr to 1st source byte not to move (unaligned)
//	rdi = ptr to 1st dest byte not to move (aligned)

	movl	%edx,%ecx		// copy length
	movl	%esi,%eax		// copy low half of source address
	andl	$63,%edx		// get remaining bytes for LShort
	andl	$15,%eax		// mask to low 4 bits of source address
	andl	$-64,%ecx		// get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//	lea	LTable(%rip),%r8	// point to dispatch table
	movq	$(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8	// work around 4586528
	addq	$(LTable-LZero),%r8	// work around 4586528
	addq	%rcx,%rsi		// point to 1st byte not copied
	movl	(%r8,%rax,4),%eax	// get offset of routine
	negq	%rcx			// now generate offset to 1st byte to be copied
	addq	%r8,%rax		// generate address of copy loop
	jmp	*%rax			// enter copy loop, selected by source alignment

LTable:					// table of copy loop addresses
	.long	(LMod0 - LTable)
	.long	(LMod1 - LTable)
	.long	(LMod2 - LTable)
	.long	(LMod3 - LTable)
	.long	(LMod4 - LTable)
	.long	(LMod5 - LTable)
	.long	(LMod6 - LTable)
	.long	(LMod7 - LTable)
	.long	(LMod8 - LTable)
	.long	(LMod9 - LTable)
	.long	(LMod10 - LTable)
	.long	(LMod11 - LTable)
	.long	(LMod12 - LTable)
	.long	(LMod13 - LTable)
	.long	(LMod14 - LTable)
	.long	(LMod15 - LTable)
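
/*
 * Illustrative sketch (not part of the original source): the idea behind the
 * sixteen LMod loops, written with SSE intrinsics for one fixed source
 * misalignment (OFFSET). Every load and store is 16-byte aligned; palignr
 * (_mm_alignr_epi8) stitches each output vector together from two adjacent
 * aligned source vectors. Because palignr takes its shift count as an
 * immediate, the shift cannot be a runtime variable, which is presumably why
 * the real code keeps one loop per possible value of the low four source
 * bits. copy_chunks_mod1 is a hypothetical name; like the real loops, the
 * final pass reads a few bytes past the last 64-byte chunk.
 *
 *	#include <tmmintrin.h>			// SSSE3: _mm_alignr_epi8
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	#define OFFSET	1			// low four bits of the source ptr
 *
 *	static void
 *	copy_chunks_mod1(uint8_t *dst, const uint8_t *src, size_t nchunks)
 *	{
 *		// dst is 16-byte aligned, src == (16-byte aligned address) + OFFSET
 *		const __m128i *s = (const __m128i *)(src - OFFSET);	// aligned
 *		__m128i *d = (__m128i *)dst;				// aligned
 *		__m128i prev = _mm_load_si128(s);			// prime the loop
 *
 *		while (nchunks--) {			// 64 bytes per iteration
 *			__m128i v1 = _mm_load_si128(s + 1);
 *			__m128i v2 = _mm_load_si128(s + 2);
 *			__m128i v3 = _mm_load_si128(s + 3);
 *			__m128i v4 = _mm_load_si128(s + 4);
 *			_mm_store_si128(d + 0, _mm_alignr_epi8(v1, prev, OFFSET));
 *			_mm_store_si128(d + 1, _mm_alignr_epi8(v2, v1, OFFSET));
 *			_mm_store_si128(d + 2, _mm_alignr_epi8(v3, v2, OFFSET));
 *			_mm_store_si128(d + 3, _mm_alignr_epi8(v4, v3, OFFSET));
 *			prev = v4;
 *			s += 4;
 *			d += 4;
 *		}
 *	}
 */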
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
//	rdx = length (>= kVeryLong bytes)

	pushq	%r11			// save return value
	movq	$_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
	call	*%rax			// call very long operand routine
	popq	%rax			// pop return value
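
/*
 * Illustrative sketch (not part of the original source): the point of a
 * separate very-long path. Per the kVeryLong comment above, operands this
 * large are worth copying with non-temporal stores, which bypass the cache
 * instead of displacing its contents. stream_copy_aligned is a hypothetical
 * helper, not the actual longcopy routine; it assumes both pointers are
 * 16-byte aligned and len is a multiple of 64.
 *
 *	#include <emmintrin.h>			// SSE2: _mm_stream_si128, _mm_sfence
 *	#include <stddef.h>
 *
 *	static void
 *	stream_copy_aligned(void *dst, const void *src, size_t len)
 *	{
 *		__m128i *d = (__m128i *)dst;
 *		const __m128i *s = (const __m128i *)src;
 *		size_t i;
 *
 *		for (i = 0; i < len / 16; i += 4) {	// 64 bytes per iteration
 *			_mm_stream_si128(d + i + 0, _mm_load_si128(s + i + 0));
 *			_mm_stream_si128(d + i + 1, _mm_load_si128(s + i + 1));
 *			_mm_stream_si128(d + i + 2, _mm_load_si128(s + i + 2));
 *			_mm_stream_si128(d + i + 3, _mm_load_si128(s + i + 3));
 *		}
 *		_mm_sfence();				// order the streaming stores
 *	}
 */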
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

	addq	%rcx,%rsi		// restore ptrs to 1st byte of source and dest
	negl	%ecx			// make length positive (known to be < 2GB)
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this

	jmp	LLeftovers		// handle 0..3 leftover bytes
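
/*
 * Illustrative sketch (not part of the original source): what the fastpath
 * above hands to the microcode, written as hypothetical GCC/Clang extended
 * asm. words is the number of 32-bit words to move; the 0..3 leftover bytes
 * are still handled separately, as the jump to LLeftovers shows.
 *
 *	#include <stddef.h>
 *
 *	static void
 *	rep_movsl_copy(void *dst, const void *src, size_t words)
 *	{
 *		__asm__ volatile ("cld\n\t"
 *				  "rep movsl"
 *				  : "+D" (dst), "+S" (src), "+c" (words)
 *				  :
 *				  : "memory", "cc");
 *	}
 */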
// Forward loop for medium length operands in which low four bits of %rsi == 0000

	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
	movdqa	(%rsi,%rcx),%xmm0
	movdqa	16(%rsi,%rcx),%xmm1
	movdqa	32(%rsi,%rcx),%xmm2
	movdqa	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0001

	movdqa	-1(%rsi,%rcx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
	movdqa	15(%rsi,%rcx),%xmm1
	movdqa	31(%rsi,%rcx),%xmm2
	movdqa	47(%rsi,%rcx),%xmm3
	movdqa	63(%rsi,%rcx),%xmm4

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0010

	movdqa	-2(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	14(%rsi,%rcx),%xmm1
	movdqa	30(%rsi,%rcx),%xmm2
	movdqa	46(%rsi,%rcx),%xmm3
	movdqa	62(%rsi,%rcx),%xmm4

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0011

	movdqa	-3(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	13(%rsi,%rcx),%xmm1
	movdqa	29(%rsi,%rcx),%xmm2
	movdqa	45(%rsi,%rcx),%xmm3
	movdqa	61(%rsi,%rcx),%xmm4

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

	movaps	-4(%rsi,%rcx),%xmm0	// 4-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movaps	12(%rsi,%rcx),%xmm1
	movaps	28(%rsi,%rcx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
	movaps	44(%rsi,%rcx),%xmm3
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%rsi,%rcx),%xmm4
	pshufd	$(0x39),%xmm2,%xmm2
	movaps	%xmm0,(%rdi,%rcx)
	pshufd	$(0x39),%xmm3,%xmm3
	movaps	%xmm1,16(%rdi,%rcx)
	movaps	%xmm2,32(%rdi,%rcx)
	movaps	%xmm3,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done
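
/*
 * Illustrative sketch (not part of the original source): the movss/pshufd
 * merge that the loop above uses instead of palignr. merge_off4 is a
 * hypothetical helper: prev is the aligned vector that starts 4 bytes before
 * the data we want, next is the following aligned vector. movss drops next's
 * low 4 bytes into prev, and pshufd with mask 0x39 rotates the four lanes
 * right so the bytes come out in source order.
 *
 *	#include <emmintrin.h>			// SSE2: _mm_shuffle_epi32 and casts
 *
 *	static inline __m128i
 *	merge_off4(__m128i prev, __m128i next)
 *	{
 *		__m128 merged = _mm_move_ss(_mm_castsi128_ps(prev),
 *					    _mm_castsi128_ps(next));
 *		return _mm_shuffle_epi32(_mm_castps_si128(merged),
 *					 _MM_SHUFFLE(0, 3, 2, 1));	// == 0x39
 *	}
 */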
// Forward loop for medium length operands in which low four bits of %rsi == 0101

	movdqa	-5(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	11(%rsi,%rcx),%xmm1
	movdqa	27(%rsi,%rcx),%xmm2
	movdqa	43(%rsi,%rcx),%xmm3
	movdqa	59(%rsi,%rcx),%xmm4

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0110

	movdqa	-6(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	10(%rsi,%rcx),%xmm1
	movdqa	26(%rsi,%rcx),%xmm2
	movdqa	42(%rsi,%rcx),%xmm3
	movdqa	58(%rsi,%rcx),%xmm4

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0111

	movdqa	-7(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	9(%rsi,%rcx),%xmm1
	movdqa	25(%rsi,%rcx),%xmm2
	movdqa	41(%rsi,%rcx),%xmm3
	movdqa	57(%rsi,%rcx),%xmm4

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%rsi,%rcx),%xmm0	// 8-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movapd	8(%rsi,%rcx),%xmm1
	movapd	24(%rsi,%rcx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
	movapd	40(%rsi,%rcx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%rsi,%rcx),%xmm4
	shufpd	$01,%xmm3,%xmm2
	movapd	%xmm0,(%rdi,%rcx)
	shufpd	$01,%xmm4,%xmm3
	movapd	%xmm1,16(%rdi,%rcx)
	movapd	%xmm2,32(%rdi,%rcx)
	movapd	%xmm3,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done
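
/*
 * Illustrative sketch (not part of the original source): the shufpd merge
 * used by the loop above. With immediate 01, shufpd keeps the high 8 bytes
 * of the earlier aligned vector and the low 8 bytes of the later one, i.e.
 * an 8-byte shift across the pair. merge_off8 is a hypothetical helper.
 *
 *	#include <emmintrin.h>			// SSE2: _mm_shuffle_pd and casts
 *
 *	static inline __m128i
 *	merge_off8(__m128i prev, __m128i next)
 *	{
 *		__m128d merged = _mm_shuffle_pd(_mm_castsi128_pd(prev),
 *						_mm_castsi128_pd(next), 1);
 *		return _mm_castpd_si128(merged);
 *	}
 */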
// Forward loop for medium length operands in which low four bits of %rsi == 1001

	movdqa	-9(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	7(%rsi,%rcx),%xmm1
	movdqa	23(%rsi,%rcx),%xmm2
	movdqa	39(%rsi,%rcx),%xmm3
	movdqa	55(%rsi,%rcx),%xmm4

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1010

	movdqa	-10(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	6(%rsi,%rcx),%xmm1
	movdqa	22(%rsi,%rcx),%xmm2
	movdqa	38(%rsi,%rcx),%xmm3
	movdqa	54(%rsi,%rcx),%xmm4

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1011

	movdqa	-11(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	5(%rsi,%rcx),%xmm1
	movdqa	21(%rsi,%rcx),%xmm2
	movdqa	37(%rsi,%rcx),%xmm3
	movdqa	53(%rsi,%rcx),%xmm4

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

	movss	(%rsi,%rcx),%xmm0	// prefetch 1st four bytes of source, right justified
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%rsi,%rcx),%xmm1	// load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%rsi,%rcx),%xmm2
	pshufd	$(0x93),36(%rsi,%rcx),%xmm3
	pshufd	$(0x93),52(%rsi,%rcx),%xmm4

	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination

	movaps	%xmm1,(%rdi,%rcx)
	movaps	%xmm2,16(%rdi,%rcx)
	movaps	%xmm3,32(%rdi,%rcx)
	movaps	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1101

	movdqa	-13(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	3(%rsi,%rcx),%xmm1
	movdqa	19(%rsi,%rcx),%xmm2
	movdqa	35(%rsi,%rcx),%xmm3
	movdqa	51(%rsi,%rcx),%xmm4

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1110

	movdqa	-14(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	2(%rsi,%rcx),%xmm1
	movdqa	18(%rsi,%rcx),%xmm2
	movdqa	34(%rsi,%rcx),%xmm3
	movdqa	50(%rsi,%rcx),%xmm4

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1111

	movdqa	-15(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	1(%rsi,%rcx),%xmm1
	movdqa	17(%rsi,%rcx),%xmm2
	movdqa	33(%rsi,%rcx),%xmm3
	movdqa	49(%rsi,%rcx),%xmm4

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.

	addq	%rdx,%rsi		// point to end of strings
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	edx = length (<= kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

	movl	%edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords

	andl	$7,%edx			// bytes?

	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//	rdx = length (> kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

	movl	%edi,%ecx		// copy destination
	andl	$15,%ecx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes

// Destination is now aligned. Prepare for reverse loops.

	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LReverseShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	subq	%rcx,%rsi		// point to endpoint of copy
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%rsi,%rcx),%xmm0
	movdqa	-32(%rsi,%rcx),%xmm1
	movdqa	-48(%rsi,%rcx),%xmm2
	movdqa	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%rsi,%rcx),%xmm0
	movdqu	-32(%rsi,%rcx),%xmm1
	movdqu	-48(%rsi,%rcx),%xmm2
	movdqu	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done
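
/*
 * Illustrative sketch (not part of the original source): why the reverse
 * path exists. When the destination overlaps the top of the source, the
 * copy must run from the end toward the beginning so each source byte is
 * read before it is overwritten. reverse_copy is a hypothetical helper; it
 * uses unaligned chunks for brevity, whereas the assembly above aligns the
 * destination and switches between movdqa and movdqu.
 *
 *	#include <emmintrin.h>
 *	#include <stddef.h>
 *
 *	static void
 *	reverse_copy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = (unsigned char *)dst + len;
 *		const unsigned char *s = (const unsigned char *)src + len;
 *
 *		while (len >= 16) {		// descending 16-byte chunks
 *			d -= 16; s -= 16; len -= 16;
 *			_mm_storeu_si128((__m128i *)d,
 *					 _mm_loadu_si128((const __m128i *)s));
 *		}
 *		while (len--)			// remaining bytes, still descending
 *			*--d = *--s;
 *	}
 */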
	COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)