/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * SSE4 and 64-byte cache lines. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"
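
// Illustrative C-style sketch (added comment, not from the original source; the
// helper names are hypothetical). It shows how the thresholds above partition the
// forward-copy strategy:
//
//      if (len <= kShort)          copy_short(dst, src, len);          // dword/byte loops (LShort)
//      else if (len >= kVeryLong)  commpage_longcopy(dst, src, len);   // out-of-line, non-temporal
//      else                        copy_sse_64byte_chunks(dst, src, len);
//                                  // ...except that the cases where the source is 8- or 16-byte
//                                  // aligned relative to the 16-byte-aligned destination
//                                  // (LMod0/LMod8) switch to "rep/movsl" once the remaining
//                                  // length reaches kFastUCode.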


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
LZero:
Lbcopy_sse4_64:                         // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

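// Illustrative C-style sketch (added comment, not from the original source): the
// single unsigned compare above is the entire overlap test.
//
//      uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;   // wraps when dst < src
//      if (diff < len)         // dst lands inside [src, src+len): forward copy would clobber
//              copy_reverse(dst, src, len);                 // hypothetical helper
//      else                    // disjoint, or dst below src: forward copy is safe
//              copy_forward(dst, src, len);                 // hypothetical helper
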
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

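// Illustrative C-style sketch (added comment, not from the original source) of the
// LShort path below:
//
//      unsigned char *d = dst;  const unsigned char *s = src;
//      for (uint32_t dwords = len >> 2; dwords; dwords--, s += 4, d += 4)
//              *(uint32_t *)d = *(const uint32_t *)s;      // doubleword loop
//      for (len &= 3; len; len--)                           // 0..3 leftover bytes
//              *d++ = *s++;
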
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

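// Illustrative C-style sketch (added comment, not from the original source) of the
// dispatch performed by LDestAligned below; LTable and LMod0..LMod15 are the labels
// in this file, everything else is hypothetical:
//
//      size_t chunks   = len & ~63;            // bytes moved by the inner loop
//      size_t residual = len & 63;             // finished later by LShort
//      int32_t offset  = LTable[src & 15];     // offset of LMod0..LMod15 from LTable
//      /* jump to (char *)LTable + offset, with rsi/rdi advanced past the chunks
//         and rcx holding -chunks so the loops count up toward zero */
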
LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8                             // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

        .align  2
LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)

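// Illustrative C-style sketch (added comment, not from the original source; the helper
// name is hypothetical): the block below just calls the commpage long-copy routine with
// the original arguments and then returns dst.
//
//      commpage_longcopy(dst, src, len);
//      return dst;
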
LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

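// Illustrative C-style sketch (added comment, not from the original source) of what
// Lfastpath below amounts to once the pointers are restored:
//
//      size_t n = chunks + residual;           // total bytes still to move (< 2GB)
//      for (size_t dwords = n >> 2; dwords; dwords--)      // the "rep/movsl" part
//              { *(uint32_t *)d = *(const uint32_t *)s;  s += 4;  d += 4; }
//      n &= 3;                                 // 0..3 bytes left, finished by LLeftovers
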
Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001

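// Added comment (not from the original source): LMod1..LMod15 all use the same trick,
// noted here once. Two adjacent aligned loads are merged with PALIGNR so every load and
// store stays 16-byte aligned even though the source is offset by n = 1..15 bytes.
// In AT&T syntax, "palignr $n, %xmmSRC, %xmmDST" computes
//
//      DST = low 16 bytes of ( (DST:SRC) >> 8*n )
//
// so result byte i is SRC[i+n] when i+n < 16, else DST[i+n-16]; e.g. with n == 1 the
// result is { SRC[1..15], DST[0] }. With SRC holding the earlier aligned block and DST
// the later one, the result is exactly the 16 unaligned source bytes that belong in the
// current aligned destination slot.
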
LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

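// Added comment (not from the original source): for a 4-byte offset the merge is done
// with MOVSS plus a dword rotate instead of PALIGNR. Roughly, per 16-byte slot:
//
//      movss   %xmmNEXT,%xmmCUR        // CUR = { NEXT[dword 0], CUR[dwords 1..3] }
//      pshufd  $0x39,%xmmCUR,%xmmCUR   // rotate dwords right: { d1, d2, d3, d0 }
//
// which leaves CUR holding the 16 source bytes starting 4 bytes into the old CUR block,
// i.e. the correctly re-aligned data for the next aligned store. (xmmCUR/xmmNEXT are
// placeholder names for the earlier and later aligned blocks.)
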
LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

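// Added comment (not from the original source): the 8-byte-offset case can merge with
// one SHUFPD per slot instead of PALIGNR. With CUR holding the earlier aligned block and
// NEXT the following one (placeholder names), "shufpd $1,%xmmNEXT,%xmmCUR" produces
//
//      CUR = { low qword = CUR[qword 1], high qword = NEXT[qword 0] }
//
// i.e. the source data shifted by 8 bytes, ready for the next aligned 16-byte store.
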
LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

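// Added comment (not from the original source): LMod12 is the mirror image of LMod4.
// Each 16-byte group is loaded with "pshufd $0x93" (rotate right 12 bytes, so the dwords
// become { d3, d0, d1, d2 }), and MOVSS then replaces the low dword with the 4 straggler
// bytes carried over from the previous group, so only aligned 16-byte stores are issued.
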
LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

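// Illustrative C-style sketch (added comment, not from the original source) of the
// reverse path below:
//
//      const unsigned char *s = src + len;     // point one past the ends
//      unsigned char *d = dst + len;
//      if (len > kShort) {
//              /* byte-copy backwards until d is 16-byte aligned, then move 64-byte
//                 chunks backwards with movdqa/movdqu, leaving 0..63 bytes */
//      }
//      while (len--) *--d = *--s;              // LReverseShort actually does quadwords
//                                              // first, then the remaining bytes
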
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)