/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * SSE4 and 64-byte cache lines.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
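
// A rough sketch of the strategy these cutoffs select, in C-like pseudocode
// (descriptive only; the names below are not labels in this file):
//
//      if (len <= kShort)              // small: SSE setup cost not worth it
//          copy 4 bytes at a time, then 0..3 leftover bytes
//      else if (len >= kVeryLong)      // huge: call the separate longcopy routine,
//          call longcopy(...)          //  which uses non-temporal stores
//      else if (len >= kFastUCode and, after aligning the destination,
//               the source is 16- or 8-byte aligned)
//          use "rep/movsl"             // microcode fastpath
//      else
//          run the 64-byte SSE loops below, dispatched on source alignment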

// void bcopy(const void *src, void *dst, size_t len);

LZero:                                  // start of this commpage routine (used for offset math)
Lbcopy_sse4_64:                         // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?

// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():

Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rdi,%r11               // save return value here
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)

LShort:
        movl    %edx,%ecx               // copy length using a 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
2:                                      // loop copying doublewords
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
4:                                      // loop copying bytes
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove

LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon
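
// A minimal worked example of the overlap test above, using small unsigned
// numbers for clarity (a sketch, not part of the routine):
//
//      source = 100, dest = 108, length = 16
//      dest - source = 8, and 8 < 16, so a forward copy would overwrite source
//      bytes 108..115 before reading them; we must take the reverse path.
//
//      source = 108, dest = 100, length = 16
//      dest - source wraps to a huge unsigned value, which is not < 16, so the
//      forward path is safe even though the buffers overlap.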

// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)

        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8     // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)
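
// A worked example of the dispatch above, with made-up numbers (a sketch):
//
//      source = 0x1003, destination already 16-byte aligned, 200 bytes left
//      edx <- 200 & 63  = 8            (leftover bytes, handled later by LShort)
//      eax <- 0x1003 & 15 = 3          (source alignment selects LMod3)
//      ecx <- 200 & -64 = 192          (bytes moved by the inner loop)
//      With %r8 pointing at LTable, (%r8,%rax,4) loads the .long for LMod3,
//      and adding %r8 yields the absolute address of the LMod3 loop.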

// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarking.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//      rdi = dest ptr
//      rsi = source ptr
//      rdx = length (>= kVeryLong bytes)

LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot-cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        jmp     LLeftovers              // handle 0..3 leftover bytes
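
// A worked example of how the total length is reconstituted above (a sketch):
//
//      Suppose 200 bytes remained when we dispatched: %rcx = -192, %edx = 8.
//      negl %ecx        ->  ecx = 192
//      orl  %edx,%ecx   ->  ecx = 192 | 8 = 200
//      The "or" acts as an add here because 192 is a multiple of 64 (its low
//      six bits are zero) while %edx is at most 63.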

// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done
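
// The loop above (and those below) counts a negative index up toward zero:
// %rcx starts at -(bytes to move) and %rsi/%rdi point to the first byte the
// loop will not move, so (%rsi,%rcx) walks forward through the source.
// A sketch of one pass, assuming 128 bytes to move:
//
//      rcx = -128:  move the first 64-byte chunk
//      rcx =  -64:  move the second 64-byte chunk
//      rcx =    0:  the loop exits and we fall into the 0..63 byte cleanup at LShort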

// Forward loop for medium length operands in which low four bits of %rsi == 0001

LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done
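
// A byte-level sketch of how palignr repacks the unaligned source above (for
// this loop imm = 1, i.e. the source starts 1 byte past a 16-byte boundary):
//
//      palignr $1,%xmm5,%xmm1 concatenates xmm1:xmm5 into a 32-byte value,
//      shifts it right by 1 byte, and keeps the low 16 bytes, giving
//          bytes 1..15 of xmm5 followed by byte 0 of xmm1
//      Assuming %xmm5 holds the previously loaded vector (source bytes -1..14
//      of this chunk) and %xmm1 holds bytes 15..30, the result is exactly
//      source bytes 0..15, now 16-byte aligned for the movdqa store.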

// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        pshufd  $(0x39),%xmm2,%xmm2
        movaps  %xmm0,(%rdi,%rcx)
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm3,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done
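
// A dword-level sketch of the movss/pshufd merge used above (the source is
// 4-byte aligned, so each aligned load is off by one 32-bit lane):
//
//      after the prime, xmm0's lanes hold source dwords { -1, 0, 1, 2 }
//      movss  %xmm1,%xmm0   replaces lane 0, giving    {  3, 0, 1, 2 }
//      pshufd $0x39 rotates the lanes, giving          {  0, 1, 2, 3 }
//      which is the aligned 16 bytes for the store; dword -1 is discarded.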

// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2
        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm3,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done
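
// A qword-level sketch of the shufpd shift used above (the source is 8-byte
// aligned, so each aligned load is off by one 64-bit lane):
//
//      xmm0 holds source qwords { -1, 0 } and xmm1 holds { 1, 2 }
//      shufpd $01,%xmm1,%xmm0 keeps the high qword of xmm0 and the low qword
//      of xmm1, leaving xmm0 = { 0, 1 }: the aligned 16 bytes to store.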

// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done
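
// A dword-level sketch of the rotate-then-merge scheme above: each pshufd $0x93
// rotates a 16-byte aligned load so that its three upper dwords land in lanes
// 1..3, where they belong in the destination, and the movss merges then drop
// the correct low dword, carried over from the neighboring vector, into lane 0.
// For example, the load at offset 4 holds source dwords {1,2,3,4}; after the
// rotate it is {4,1,2,3}, and replacing lane 0 with dword 0 yields {0,1,2,3},
// the aligned 16 bytes for the first store.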

// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
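
// A small worked example of why these run backwards (a sketch):
//
//      source = 100, dest = 104, length = 8
//      Copying forward, byte 104 would be overwritten before it is read as a
//      source byte.  Copying backwards, from the last byte to the first, each
//      source byte is read before any write can clobber it, so the destructive
//      overlap is handled correctly.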

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        andl    $7,%edx                 // leftover bytes?
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

        COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)