/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include "platfunc.h"

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"

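// A rough summary of the strategy these cutoffs select (as implemented below):
//      length <= kShort              copy with GPRs (LShort / LReverseShort)
//      kShort < length < kVeryLong   copy 64-byte chunks with SSE (LMod0..LMod15)
//      length >= kVeryLong           call _longcopy for non-temporal stores
// Within the SSE range, the LMod0 and LMod8 cases (source and destination mutually
// 8-byte aligned) divert to the "rep/movsl" microcode fastpath once the chunked
// length reaches kFastUCode.
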
// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START_GENERIC(bcopy, sse3x, 64, 5)
LZero:
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort
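
// For illustration: the single unsigned compare above folds both overlap tests into
// one.  Computing (dest - source) as an unsigned 64-bit value:
//      dest <  source                =>  wraps to a huge value  =>  not < length  =>  forward copy
//      dest >= source + length       =>  difference >= length   =>  forward copy
//      source < dest < source+length =>  0 <= difference < length  =>  reverse copy
// e.g. source=0x1000, dest=0x1010, length=0x40: dest-source=0x10 < 0x40, so a forward
// copy would clobber the tail of the source and we take the reverse path.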

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START_GENERIC(memcpy, sse3x, 64, 0)  // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START_GENERIC(memmove, sse3x, 64, 0) // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  Since this is the most common case, it is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than using unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movq    %rdx,%rcx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        leaq    LTable(%rip), %r8
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment
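
// For illustration (a worked example of the register setup above): suppose 200 bytes
// remain once the destination is 16-byte aligned.  Then rdx = 200 & 63 = 8,
// rcx = -(200 & ~63) = -192, and rsi/rdi have been advanced by 192.  Inside each loop,
// (%rsi,%rcx) therefore starts at the first unmoved source byte, rcx counts up by 64
// per chunk, and the loop falls out when rcx reaches zero with 8 bytes left for LShort.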

        .align  2
LTable:                                 // table of copy loop addresses
// Force generation of assembly-time constants.  Otherwise the assembler creates
// subtractor relocations relative to the first external symbol, and this file
// has none.
        .set    LMod0Offset, LMod0 - LTable
        .set    LMod1Offset, LMod1 - LTable
        .set    LMod2Offset, LMod2 - LTable
        .set    LMod3Offset, LMod3 - LTable
        .set    LMod4Offset, LMod4 - LTable
        .set    LMod5Offset, LMod5 - LTable
        .set    LMod6Offset, LMod6 - LTable
        .set    LMod7Offset, LMod7 - LTable
        .set    LMod8Offset, LMod8 - LTable
        .set    LMod9Offset, LMod9 - LTable
        .set    LMod10Offset, LMod10 - LTable
        .set    LMod11Offset, LMod11 - LTable
        .set    LMod12Offset, LMod12 - LTable
        .set    LMod13Offset, LMod13 - LTable
        .set    LMod14Offset, LMod14 - LTable
        .set    LMod15Offset, LMod15 - LTable
        .long   LMod0Offset
        .long   LMod1Offset
        .long   LMod2Offset
        .long   LMod3Offset
        .long   LMod4Offset
        .long   LMod5Offset
        .long   LMod6Offset
        .long   LMod7Offset
        .long   LMod8Offset
        .long   LMod9Offset
        .long   LMod10Offset
        .long   LMod11Offset
        .long   LMod12Offset
        .long   LMod13Offset
        .long   LMod14Offset
        .long   LMod15Offset


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they show up in benchmarks.  There isn't enough room for them in the
// platfunc area reserved for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)

LVeryLong:
        pushq   %r11                    // save return value
        call    _longcopy               // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since we do not
// know whether the destination is in cache.

Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes
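
// For illustration: rcx arrives here as -(chunked length) and edx holds the 0..63
// residual, so "negl %ecx" then "orl %edx,%ecx" reassembles the full remaining count.
// The OR is safe because the chunked length is a multiple of 64, whose low six bits
// are zero: e.g. with rcx = -192 and rdx = 8, negl gives 192 and 192 | 8 = 200, the
// same as 192 + 8.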


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001

LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done
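
// For illustration (LMod1 case): at the top of an iteration, %xmm5 holds the aligned
// dq covering source bytes S-1..S+14 and %xmm1 the dq covering S+15..S+30, where S is
// the first unaligned source byte of this chunk.  "palignr $1,%xmm5,%xmm1" concatenates
// xmm1:xmm5, shifts the 32-byte value right by 1 byte, and keeps the low 16 bytes,
// leaving %xmm1 = S..S+15 -- exactly the 16 bytes to store at the aligned destination.
// The other palignr-based loops do the same with a shift of N bytes.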


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the single-precision float data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done
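
// For illustration (LMod4 case): %xmm0 holds the aligned dwords at S-4, S, S+4, S+8
// and %xmm1 the next aligned dq starting at S+12, where S is the first unaligned
// (4-byte aligned) source byte of the chunk.  "movss %xmm1,%xmm0" overwrites the low
// dword of %xmm0 with the dword at S+12, and "pshufd $0x39" rotates the four dwords
// right by one, leaving %xmm0 = {S, S+4, S+8, S+12}, i.e. the 16 bytes to store at
// the aligned destination.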


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the double-precision float data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done
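
// For illustration (LMod8 case): %xmm0 holds the aligned qwords at S-8 and S, and
// %xmm1 those at S+8 and S+16.  "shufpd $01,%xmm1,%xmm0" selects the high qword of
// %xmm0 and the low qword of %xmm1, leaving %xmm0 = {S, S+8}, the 16 unaligned source
// bytes shifted into destination alignment with a single instruction.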


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the single-precision float data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1   // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done
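
// For illustration (LMod12 case): each "pshufd $0x93" loads an aligned dq and rotates
// its dwords right by three (equivalently left by one), so %xmm1 becomes
// {S+16, S+4, S+8, S+12} for a chunk whose first unaligned source byte is S.
// "movss %xmm0,%xmm1" then drops in the dword at S carried over from the previous
// iteration (or the priming load), yielding {S, S+4, S+8, S+12}; %xmm5 preserves the
// rotated dq whose low dword, at S+64, seeds the next iteration via %xmm0.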


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no
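
// For illustration: this is the mirror image of the forward setup.  rcx holds the
// (positive) number of 64-byte chunks' worth of bytes and rsi/rdi have been backed up
// by that amount, so -16(%rsi,%rcx) addresses the last dq of the region; rcx counts
// down by 64 per chunk and the loops below exit when it reaches zero, leaving 0..63
// bytes for LReverseShort.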

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)