/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort		80			// too short to bother with SSE (must be >=80)
#define kVeryLong	(500*1024)		// large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode	((16*1024)-15)		// cutoff for microcode fastpath for "rep/movsl"
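
// Illustrative C-level sketch (not part of this file) of how the three thresholds
// above steer the dispatch implemented below. The helper names are hypothetical,
// and the real selection also depends on operand overlap and on the 16-byte
// destination alignment established further down:
//
//	static void sketch_dispatch(uint8_t *dst, const uint8_t *src, size_t len) {
//	    if (len <= kShort)			// short: simple GPR doubleword/byte loops
//	        copy_short(dst, src, len);
//	    else if (len >= kVeryLong)		// huge: separate _longcopy routine (non-temporal stores)
//	        longcopy(dst, src, len);
//	    else if (len >= kFastUCode && well_aligned(dst, src))
//	        copy_rep_movs(dst, src, len);	// let the "rep/movs" microcode do the work
//	    else
//	        copy_sse_chunks(dst, src, len);	// 64-byte SSE loops picked by source alignment
//	}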

// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START_GENERIC(bcopy, sse3x, 64, 5)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// exchange source and dest ptrs
	movq	%rax,%rdi
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	jbe	LShort			// no
	jmp	LNotShort		// yes
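
// Note on the overlap test above: computing (dest - source) as an unsigned value
// and comparing it against the length folds two checks into a single branch.
// In C terms (illustrative only):
//
//	int must_reverse = (uintptr_t)(dst - src) < len;
//
// If dst precedes src the subtraction wraps to a huge unsigned value, so the test
// fails and a forward copy is safe; only a destination starting inside the source
// buffer forces the descending (reverse) copy.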

// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);

PLATFUNC_FUNCTION_START_GENERIC(memcpy, sse3x, 64, 0)	// void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START_GENERIC(memmove, sse3x, 64, 0)	// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rdi,%r11		// save return value here
	movq	%rdi,%rax
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LNotShort		// yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//	rdx = length (<= kShort)
//	rsi = source ptr
//	rdi = dest ptr
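//
// A rough C equivalent of this short path (illustrative sketch only):
//
//	for ( ; len >= 4; len -= 4, src += 4, dst += 4)	// doubleword loop ("2:" below)
//	    *(uint32_t *)dst = *(const uint32_t *)src;
//	for ( ; len != 0; len--)			// 0..3 leftover bytes (LLeftovers)
//	    *dst++ = *src++;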
LShort:
	movl	%edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
2:					// loop copying doublewords
LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
4:					// loop copying bytes
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret

LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	rdx = length (> kShort)
//	rsi = source ptr
//	rdi = dest ptr
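//
// Illustrative C-level sketch of the alignment step below (sketch only):
//
//	size_t head = (-(uintptr_t)dst) & 15;	// 0..15 bytes until dst is 16-byte aligned
//	len -= head;				// "rep/movsb" then copies those head bytes,
//						// advancing both %rsi and %rdi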
LNotShort:
	cmpq	$(kVeryLong),%rdx	// long enough to justify heavyweight loops?
	jae	LVeryLong		// use very-long-operand path
	movl	%edi,%ecx		// copy low half of destination ptr
	negl	%ecx
	andl	$15,%ecx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length
	rep				// align destination
	movsb

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//	rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//	rsi = ptr to 1st source byte not to move (unaligned)
//	rdi = ptr to 1st dest byte not to move (aligned)
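//
// Illustrative C-level sketch of the setup below (sketch only):
//
//	size_t chunks   = len & ~(size_t)63;	// bytes handled by the 64-byte inner loop
//	size_t residual = len & 63;		// 0..63 bytes left for LShort afterwards
//	src += chunks;  dst += chunks;		// point just past the chunked region...
//	intptr_t index  = -(intptr_t)chunks;	// ...and count a negative index up toward zero
//	unsigned which  = (uintptr_t)src & 15;	// source alignment selects LMod0..LMod15
//
// The jump goes through LTable, which holds 32-bit offsets of LMod0..LMod15
// relative to LTable itself.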
LDestAligned:
	movq	%rdx,%rcx		// copy length
	movl	%esi,%eax		// copy low half of source address
	andl	$63,%edx		// get remaining bytes for LShort
	andl	$15,%eax		// mask to low 4 bits of source address
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	leaq	LTable(%rip),%r8
	addq	%rcx,%rsi		// point to 1st byte not copied
	addq	%rcx,%rdi
	movl	(%r8,%rax,4),%eax	// get offset of routine
	negq	%rcx			// now generate offset to 1st byte to be copied
	addq	%r8,%rax		// generate address of copy loop
	jmp	*%rax			// enter copy loop, selected by source alignment

LTable:					// table of copy loop addresses
// force generation of assembly-time constants. Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
	.set	LMod0Offset, LMod0 - LTable
	.set	LMod1Offset, LMod1 - LTable
	.set	LMod2Offset, LMod2 - LTable
	.set	LMod3Offset, LMod3 - LTable
	.set	LMod4Offset, LMod4 - LTable
	.set	LMod5Offset, LMod5 - LTable
	.set	LMod6Offset, LMod6 - LTable
	.set	LMod7Offset, LMod7 - LTable
	.set	LMod8Offset, LMod8 - LTable
	.set	LMod9Offset, LMod9 - LTable
	.set	LMod10Offset, LMod10 - LTable
	.set	LMod11Offset, LMod11 - LTable
	.set	LMod12Offset, LMod12 - LTable
	.set	LMod13Offset, LMod13 - LTable
	.set	LMod14Offset, LMod14 - LTable
	.set	LMod15Offset, LMod15 - LTable
	.long	LMod0Offset
	.long	LMod1Offset
	.long	LMod2Offset
	.long	LMod3Offset
	.long	LMod4Offset
	.long	LMod5Offset
	.long	LMod6Offset
	.long	LMod7Offset
	.long	LMod8Offset
	.long	LMod9Offset
	.long	LMod10Offset
	.long	LMod11Offset
	.long	LMod12Offset
	.long	LMod13Offset
	.long	LMod14Offset
	.long	LMod15Offset

// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved for bcopy on the platfunc, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
//	rdi = dest ptr
//	rsi = source ptr
//	rdx = length (>= kVeryLong bytes)
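//
// The kVeryLong threshold exists because operands this large are best copied with
// non-temporal stores, which bypass the cache instead of polluting it with data
// that will not be re-read soon. A minimal sketch of that idea with SSE2
// intrinsics (illustrative only; this is not the actual _longcopy):
//
//	#include <emmintrin.h>
//	// dst assumed 16-byte aligned; remainder handling omitted
//	static void stream_copy(void *dst, const void *src, size_t len) {
//	    __m128i *d = (__m128i *)dst;
//	    const __m128i *s = (const __m128i *)src;
//	    for (size_t i = 0; i < len / 16; i++)
//	        _mm_stream_si128(&d[i], _mm_loadu_si128(&s[i]));	// non-temporal store
//	    _mm_sfence();					// order the streaming stores
//	}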
LVeryLong:
	pushq	%r11			// save return value
	call	_longcopy		// call very long operand routine
	popq	%rax			// pop return value
	popq	%rbp
	ret

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
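//
// What the fastpath below amounts to, as a C-level sketch (illustrative only;
// rep_movsl_words is a hypothetical helper standing in for "rep/movsl"):
//
//	len  = chunks + residual;		// recombine the split made at LDestAligned
//	src -= chunks;  dst -= chunks;		// back up to the first byte not yet moved
//	rep_movsl_words(dst, src, len / 4);	// microcode moves len/4 doublewords
//	// ...then fall into LLeftovers for the final 0..3 bytes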
Lfastpath:
	addq	%rcx,%rsi		// restore ptrs to 1st byte of source and dest
	addq	%rcx,%rdi
	negl	%ecx			// make length positive (known to be < 2GB)
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	movsl
	jmp	LLeftovers		// handle 0..3 leftover bytes

// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
	movdqa	(%rsi,%rcx),%xmm0
	movdqa	16(%rsi,%rcx),%xmm1
	movdqa	32(%rsi,%rcx),%xmm2
	movdqa	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0001
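//
// The loop below (and LMod2..LMod15, apart from the 4-, 8- and 12-byte variants)
// realigns the source on the fly with "palignr": each aligned 16-byte load is
// combined with its predecessor and shifted right by the source misalignment, so
// every store stays aligned. The previous chunk's last vector has to be carried
// across iterations (it is primed into %xmm0 just below) to supply the low bytes
// of the first output vector. With SSSE3 intrinsics the core step looks roughly
// like this (illustrative sketch; the names are not from this file):
//
//	#include <tmmintrin.h>
//	// the 16 source bytes starting 1 byte into prev, i.e. (cur:prev) >> 8 bits
//	static inline __m128i realign1(__m128i cur, __m128i prev) {
//	    return _mm_alignr_epi8(cur, prev, 1);
//	}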
LMod1:
	movdqa	-1(%rsi,%rcx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
	movdqa	15(%rsi,%rcx),%xmm1
	movdqa	31(%rsi,%rcx),%xmm2
	movdqa	47(%rsi,%rcx),%xmm3
	movdqa	63(%rsi,%rcx),%xmm4

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
	movdqa	-2(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	14(%rsi,%rcx),%xmm1
	movdqa	30(%rsi,%rcx),%xmm2
	movdqa	46(%rsi,%rcx),%xmm3
	movdqa	62(%rsi,%rcx),%xmm4

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
	movdqa	-3(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	13(%rsi,%rcx),%xmm1
	movdqa	29(%rsi,%rcx),%xmm2
	movdqa	45(%rsi,%rcx),%xmm3
	movdqa	61(%rsi,%rcx),%xmm4

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
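//
// With a 4-byte-aligned source, "palignr" is not needed: "movss" replaces the low
// dword of the previous vector (whose four low bytes are no longer needed) with
// the low dword of the next one, and "pshufd $0x39" then rotates the four dwords
// down one lane, leaving the register holding 16 contiguous source bytes at the
// 4-byte offset. Staying in the single-precision domain is what the "movaps"
// loads below are for.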
LMod4:
	movaps	-4(%rsi,%rcx),%xmm0	// 4-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movaps	12(%rsi,%rcx),%xmm1
	movaps	28(%rsi,%rcx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
	movaps	44(%rsi,%rcx),%xmm3
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%rsi,%rcx),%xmm4
	pshufd	$(0x39),%xmm2,%xmm2
	movaps	%xmm0,(%rdi,%rcx)
	pshufd	$(0x39),%xmm3,%xmm3
	movaps	%xmm1,16(%rdi,%rcx)
	movaps	%xmm2,32(%rdi,%rcx)
	movaps	%xmm3,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
	movdqa	-5(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	11(%rsi,%rcx),%xmm1
	movdqa	27(%rsi,%rcx),%xmm2
	movdqa	43(%rsi,%rcx),%xmm3
	movdqa	59(%rsi,%rcx),%xmm4

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
	movdqa	-6(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	10(%rsi,%rcx),%xmm1
	movdqa	26(%rsi,%rcx),%xmm2
	movdqa	42(%rsi,%rcx),%xmm3
	movdqa	58(%rsi,%rcx),%xmm4

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
	movdqa	-7(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	9(%rsi,%rcx),%xmm1
	movdqa	25(%rsi,%rcx),%xmm2
	movdqa	41(%rsi,%rcx),%xmm3
	movdqa	57(%rsi,%rcx),%xmm4

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
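//
// With an 8-byte-aligned source each "shufpd $01" glues the high quadword of one
// load onto the low quadword of the next, which is exactly an 8-byte shift across
// the pair; no byte-granular shuffling is needed, so plain SSE2 double-precision
// moves suffice here.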
LMod8:
	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%rsi,%rcx),%xmm0	// 8-byte aligned: prime the loop
1:					// loop over 64-byte chunks
	movapd	8(%rsi,%rcx),%xmm1
	movapd	24(%rsi,%rcx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
	movapd	40(%rsi,%rcx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%rsi,%rcx),%xmm4
	shufpd	$01,%xmm3,%xmm2
	movapd	%xmm0,(%rdi,%rcx)
	shufpd	$01,%xmm4,%xmm3
	movapd	%xmm1,16(%rdi,%rcx)
	movapd	%xmm2,32(%rdi,%rcx)
	movapd	%xmm3,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
	movdqa	-9(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	7(%rsi,%rcx),%xmm1
	movdqa	23(%rsi,%rcx),%xmm2
	movdqa	39(%rsi,%rcx),%xmm3
	movdqa	55(%rsi,%rcx),%xmm4

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
	movdqa	-10(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	6(%rsi,%rcx),%xmm1
	movdqa	22(%rsi,%rcx),%xmm2
	movdqa	38(%rsi,%rcx),%xmm3
	movdqa	54(%rsi,%rcx),%xmm4

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
	movdqa	-11(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	5(%rsi,%rcx),%xmm1
	movdqa	21(%rsi,%rcx),%xmm2
	movdqa	37(%rsi,%rcx),%xmm3
	movdqa	53(%rsi,%rcx),%xmm4

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
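//
// This is the 4-byte case approached from the other side of LMod4: each aligned
// load is rotated first ("pshufd $0x93" brings its top dword down into lane 0),
// and "movss" then replaces that lane, which really belongs to the following
// output vector, with lane 0 of the previously rotated vector. %xmm0, primed
// below with the first four source bytes, carries that dword into the first
// iteration.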
LMod12:
	movss	(%rsi,%rcx),%xmm0	// prefetch 1st four bytes of source, right justified
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%rsi,%rcx),%xmm1	// load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%rsi,%rcx),%xmm2
	pshufd	$(0x93),36(%rsi,%rcx),%xmm3
	pshufd	$(0x93),52(%rsi,%rcx),%xmm4

	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination

	movaps	%xmm1,(%rdi,%rcx)
	movaps	%xmm2,16(%rdi,%rcx)
	movaps	%xmm3,32(%rdi,%rcx)
	movaps	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
	movdqa	-13(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	3(%rsi,%rcx),%xmm1
	movdqa	19(%rsi,%rcx),%xmm2
	movdqa	35(%rsi,%rcx),%xmm3
	movdqa	51(%rsi,%rcx),%xmm4

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
	movdqa	-14(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	2(%rsi,%rcx),%xmm1
	movdqa	18(%rsi,%rcx),%xmm2
	movdqa	34(%rsi,%rcx),%xmm3
	movdqa	50(%rsi,%rcx),%xmm4

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
	movdqa	-15(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	1(%rsi,%rcx),%xmm1
	movdqa	17(%rsi,%rcx),%xmm2
	movdqa	33(%rsi,%rcx),%xmm3
	movdqa	49(%rsi,%rcx),%xmm4

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	jmp	LShort			// copy remaining 0..63 bytes and done

// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
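//
// A C-level sketch of the reverse copy (illustrative only): both pointers are
// first advanced to one byte past the end, and the copy then walks downward so
// that overlapping bytes are read before they are overwritten.
//
//	src += len;  dst += len;
//	while (len--)
//	    *--dst = *--src;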
LReverse:
	addq	%rdx,%rsi		// point to end of strings
	addq	%rdx,%rdi
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	edx = length (<= kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest
LReverseShort:
	movl	%edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords
	andl	$7,%edx			// bytes?
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret

// Handle a reverse move long enough to justify using SSE.
//	rdx = length (> kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest
LReverseNotShort:
	movl	%edi,%ecx		// copy destination
	andl	$15,%ecx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LReverseShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	subq	%rcx,%rsi		// point to endpoint of copy
	subq	%rcx,%rdi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%rsi,%rcx),%xmm0
	movdqa	-32(%rsi,%rcx),%xmm1
	movdqa	-48(%rsi,%rcx),%xmm2
	movdqa	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx		// fewer bytes left in the chunked region
	jne	LReverseAlignedLoop	// loop if more chunks

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
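//
// Note that only the loads are unaligned here: the destination was 16-byte
// aligned above, so the stores can stay "movdqa", while the source, which gets no
// such treatment on this path, is loaded with "movdqu" rather than shifted and
// repacked as in the forward loops.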

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%rsi,%rcx),%xmm0
	movdqu	-32(%rsi,%rcx),%xmm1
	movdqu	-48(%rsi,%rcx),%xmm2
	movdqu	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx		// fewer bytes left in the chunked region
	jne	LReverseUnalignedLoop	// loop if more chunks

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)