/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * SSE4 and 64-byte cache lines.  This is the 64-bit version.  (As the commpage
 * descriptor at the end of this file indicates, the "palignr" loops below also
 * require Supplemental SSE3.)
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (500*1024)      // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode      ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
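
// Roughly, these cutoffs partition forward copies as follows (see the code below):
//
//      if (len <= kShort)          use the doubleword/byte loops at LShort
//      else if (len >= kVeryLong)  call the separate longcopy routine (non-temporal stores)
//      else                        run one of the 64-byte SSE loops LMod0..LMod15; the
//                                  mutually 8- and 16-byte-aligned cases switch to the
//                                  "rep/movsl" microcode once the length reaches kFastUCode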


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
LZero:
Lbcopy_sse4_64:                         // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//
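// (The ".align 5, 0x90" directives pad the entries to 32-byte boundaries; callers
// presumably reach these routines at fixed commpage offsets, which is why the
// distance from the bcopy() entry matters.)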

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr
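//
// In C terms this path is roughly (an illustrative sketch, with src and dst taken
// as byte pointers):
//
//      while (len >= 4) { *(uint32_t *)dst = *(uint32_t *)src; src += 4; dst += 4; len -= 4; }
//      while (len > 0)  { *(uint8_t  *)dst = *(uint8_t  *)src; src += 1; dst += 1; len -= 1; }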

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)
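//
// In rough C-like terms, the dispatch below computes (a sketch using the register
// roles listed above):
//
//      chunks   = length & ~63;        // bytes handled by the 64-byte loops
//      residual = length &  63;        // left over for LShort afterward
//      src += chunks;  dst += chunks;  // point just past the chunked region
//      index    = -chunks;             // the loops count %rcx up from -chunks to 0
//      goto LMod[src & 15];            // i.e. LTable base + LTable[src & 15]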

LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8                             // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

        .align  2
LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)
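//
// The destination, source, and length are already in %rdi, %rsi, and %rdx, so the call
// below behaves like an ordinary indirect C call (a sketch; the exact longcopy signature
// is not spelled out here):
//
//      void (*longcopy)(void *dst, const void *src, size_t len) = <routine at _COMM_PAGE_LONGCOPY>;
//      longcopy(dst, src, len);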

LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
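//
// On entry to Lfastpath, %rcx holds the negated chunked byte count and %edx the 0..63
// residual, so the first four instructions reconstruct the original pointers and total
// length.  Roughly (a sketch):
//
//      src -= chunks;  dst -= chunks;  // undo the "point past the region" adjustment
//      len  = chunks | residual;       // disjoint bit ranges, so this equals chunks + residual
//      copy len/4 doublewords with "rep/movsl", then the 0..3 leftover bytes at LLeftovers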

Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001
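//
// LMod1..LMod15 (except the 4-, 8-, and 12-byte cases) all use the same technique:
// load aligned 16-byte blocks that straddle the bytes we want, then let "palignr"
// shift each adjacent pair right by the source misalignment to recover the bytes
// actually needed.  For a misalignment of 1 (a sketch, bytes labeled by source offset):
//
//      xmm5 = src[-1..14]  (aligned load)
//      xmm1 = src[15..30]  (aligned load)
//      palignr $1,%xmm5,%xmm1   =>   xmm1 = src[0..15], ready for an aligned store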

LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
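//
// The idea (a sketch): each aligned 16-byte load is off by one 32-bit word, so we patch
// the low word of a vector with the low word of the next load ("movss") and then rotate
// right by one word ("pshufd $0x39") to restore memory order.  With words labeled by
// source offset, element 0 listed first:
//
//      xmm0 = {w-1, w0, w1, w2},  xmm1 = {w3, w4, w5, w6}
//      movss  %xmm1,%xmm0        =>   xmm0 = {w3, w0, w1, w2}
//      pshufd $0x39,%xmm0,%xmm0  =>   xmm0 = {w0, w1, w2, w3}   (ready for an aligned store)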

LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
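//
// Here the source is 8-byte aligned relative to the 16-byte-aligned destination, so each
// output vector is just the high quadword of one aligned load glued to the low quadword
// of the next; "shufpd $01" does exactly that.  With quadwords labeled by source offset
// (a sketch, element 0 listed first):
//
//      xmm0 = {q-1, q0},  xmm1 = {q1, q2}
//      shufpd $01,%xmm1,%xmm0   =>   xmm0 = {q0, q1}   (ready for an aligned store)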

LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
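//
// This is the mirror image of LMod4: each aligned 16-byte block is rotated right by
// 12 bytes as it is loaded ("pshufd $0x93" straight from memory), and "movss" then
// drops in the one word belonging to the previous block.  With words labeled by source
// offset (a sketch, element 0 listed first):
//
//      pshufd $0x93 of {w1, w2, w3, w4}   =>   xmm1 = {w4, w1, w2, w3}
//      movss  %xmm0,%xmm1  (low word of xmm0 is w0)   =>   xmm1 = {w0, w1, w2, w3}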

LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr
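//
// In outline (a sketch): both pointers are first advanced to the end of their buffers,
// and everything below copies downward so overlapping bytes are read before they are
// overwritten.  For the short case this is roughly:
//
//      src += len;  dst += len;
//      while (len >= 8) { src -= 8; dst -= 8; len -= 8; *(uint64_t *)dst = *(uint64_t *)src; }
//      while (len > 0)  { src -= 1; dst -= 1; len -= 1; *(uint8_t  *)dst = *(uint8_t  *)src; }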

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)